JavaScript Performance Through the Spyglass

Vyacheslav Egorov

Excelsior JET

V8

Dart VM

LuaJIT

complexity

no choice

[ask on StackOverflow]

profile

and/or

read code

Crankshaft ⇒ IRHydra

[http://mrale.ph/irhydra/2]

# Get the source as described at:
# https://chromium.googlesource.com/v8/v8.git
$ fetch v8
...
$ cd v8
$ make x64.debug
...
$ out/x64.debug/d8 --print-ast test.js
          
. . FOR at -1
. . . YIELD COUNT 0
. . . BODY at -1
. . . . BLOCK at -1
. . . . . BLOCK NOCOMPLETIONS at -1
. . . . . . EXPRESSION STATEMENT at -1
. . . . . . . INIT at -1
. . . . . . . . VAR PROXY local[0] (mode = LET) "i"
. . . . . . . . VAR PROXY local[2] (mode = TEMPORARY) ".for"
. . . . . . IF at -1
. . . . . . . CONDITION at -1
. . . . . . . . EQ at -1
. . . . . . . . . VAR PROXY local[3] (mode = TEMPORARY) ".for"
. . . . . . . . . LITERAL 1
. . . . . . . THEN at -1
. . . . . . . . EXPRESSION STATEMENT at -1
. . . . . . . . . ASSIGN at -1
. . . . . . . . . . VAR PROXY local[3] (mode = TEMPORARY) ".for"
. . . . . . . . . . LITERAL 0
. . . . . . . ELSE at 200
. . . . . . . . EXPRESSION STATEMENT at 200
. . . . . . . . . POST INC at 200
. . . . . . . . . . VAR PROXY local[0] (mode = LET) "i"
. . . . . . EXPRESSION STATEMENT at -1
. . . . . . . ASSIGN at -1
. . . . . . . . VAR PROXY local[4] (mode = TEMPORARY) ".for"
. . . . . . . . LITERAL 1
. . . . . . IF at 186
. . . . . . . CONDITION at 186
. . . . . . . . LT at 186
. . . . . . . . . VAR PROXY local[0] (mode = LET) "i"
. . . . . . . . . LITERAL 1e+08
. . . . . . . THEN at -1
. . . . . . . . EMPTY at -1
. . . . . . . ELSE at -1
. . . . . . . . BREAK at -1
. . . . . FOR at 168
. . . . . . YIELD COUNT 0
. . . . . . COND at -1
. . . . . . . EQ at -1
. . . . . . . . VAR PROXY local[4] (mode = TEMPORARY) ".for"
        
VAR PROXY local[0] (mode = LET) "i"
VAR PROXY local[2] (mode = TEMPORARY) ".for"
        
        
$ git grep '"\.for"'
src/ast/ast-value-factory.h:      F(dot_for, ".for")                            \
$ git grep dot_for
src/parsing/parser.cc:  const AstRawString* temp_name = ast_value_factory()->dot_for_string();
src/parsing/parser.cc:            scope_->NewTemporary(ast_value_factory()->dot_for_string());
src/parsing/parser.cc:          scope_->NewTemporary(ast_value_factory()->dot_for_string());

  // We are given a for statement of the form
  //
  //  labels: for (let/const x = i; cond; next) body
  //
  // and rewrite it as follows.  Here we write {{ ... }} for init-blocks, ie.,
  // blocks whose ignore_completion_value_ flag is set.
  //
  //  {
  //    let/const x = i;
  //    temp_x = x;
  //    first = 1;
  //    undefined;
  //    outer: for (;;) {
  //      let/const x = temp_x;
  //      {{ if (first == 1) {
  //           first = 0;
  //         } else {
  //           next;
  //         }
  //         flag = 1;
  //         if (!cond) break;
  //      }}
  //      labels: for (; flag == 1; flag = 0, temp_x = x) {
  //        body
  //      }
  //      {{ if (flag == 1)  // Body used break.
  //           break;
  //      }}
  //    }
  //  }
        

internals are important

internals are irrelevant



function find(val){
  function index(value) {
    return [1, 2, 3].indexOf(value);
  }

  return index(val);
}
// => 5464 op/ms
          

var arr = [1, 2, 3];
function find(val){
  function index(value) {
    return arr.indexOf(value);
  }

  return index(val);
}
// => 14084 op/ms
          

«array allocation
is expensive!»


var makeArr = () => [1, 2, 3];
function find(val){
  function index(value) {
    return makeArr().indexOf(value);
  }

  return index(val);
}
// => 12048 op/ms
          

$ perf record d8 --perf-basic-prof test.js
$ perf report
          
14.80% v8::internal::Factory::NewFunction
 9.98% v8::internal::Factory::NewFunctionFromSharedFunctionInfo
 8.74% *InnerArrayIndexOf native array.js:1019
 7.24% v8::internal::Factory::NewFunctionFromSharedFunctionInfo
 5.79% v8::internal::Runtime_NewClosure
 4.84% Builtin:ArgumentsAdaptorTrampoline
 4.44% v8::internal::Heap::AllocateRaw
 4.30% v8::internal::Factory::New<v8::internal::JSFunction>
 3.74% v8::internal::SharedFunctionInfo::SearchOptimizedCodeMap
 3.20% ~index test.js:2
          
14.49% *InnerArrayIndexOf native array.js:1019
10.47% LoadIC:A load IC from the snapshot
 8.45% ~index b2.js:12
 8.05% Stub:FastNewClosureStub
 5.77% Builtin:ArgumentsAdaptorTrampoline
 5.23% *find2 b2.js:11
          

FastNewClosureStub
did not support allocation of functions with literals inside

[now it actually can, example is from February 2016]
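
Independent of what the stub supports, the allocation can be avoided entirely by not creating a fresh closure (or a fresh array) on every call. A minimal sketch of that shape (numbers will differ from the ones above):

// hoist both the helper and the literal out of the hot path,
// so nothing has to be allocated per call to find
var arr = [1, 2, 3];

function index(value) {
  return arr.indexOf(value);
}

function find(val) {
  return index(val);
}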

ChaCha20


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    quarterRound(x, 0, 4, 8,12);
    quarterRound(x, 1, 5, 9,13);
    quarterRound(x, 2, 6,10,14);
    quarterRound(x, 3, 7,11,15);
    quarterRound(x, 0, 5,10,15);
    quarterRound(x, 1, 6,11,12);
    quarterRound(x, 2, 7, 8,13);
    quarterRound(x, 3, 4, 9,14);
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

19MB/s



function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // quarterRound(x, 0, 4, 8,12);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) <<  8) | ((x[12] ^ x[ 0]) >>> 24);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) <<  7) | ((x[ 4] ^ x[ 8]) >>> 25);
    // ... so on ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

2MB/s

«I HEARD INLINE
MUST IMPROVE PERFORMANCE!»


function U32TO8_LE(x, i, u) {
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

function U32TO8_LE(x, i, u) {
    // ____     ___  ____   __________              _        ____   ___       ___
    // `MM'     `M' 6MMMMb\ `MMMMMMMMM             dM.      6MMMMb\ `MMb     dMM'
    //  MM       M 6M'    `  MM      \            ,MMb     6M'    `  MMM.   ,PMM
    //  MM       M MM        MM                   d'YM.    MM        M`Mb   d'MM
    //  MM       M YM.       MM    ,             ,P `Mb    YM.       M YM. ,P MM
    //  MM       M  YMMMMb   MMMMMMM             d'  YM.    YMMMMb   M `Mb d' MM
    //  MM       M      `Mb  MM    `            ,P   `Mb        `Mb  M  YM.P  MM
    //  MM       M       MM  MM                 d'    YM.        MM  M  `Mb'  MM
    //  YM       M       MM  MM                ,MMMMMMMMb        MM  M   YP   MM
    //   8b     d8 L    ,M9  MM      /         d'      YM. L    ,M9  M   `'   MM
    //    YMMMMM9  MYMMMM9  _MMMMMMMMM       _dM_     _dMM_MYMMMM9  _M_      _MM_
    //
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

19MB/s


function U32TO8_LE(x, i, u) {
    // ____     ___  ____   __________              _        ____   ___       ___
    // `MM'     `M' 6MMMMb\ `MMMMMMMMM             dM.      6MMMMb\ `MMb     dMM'
    //  MM       M 6M'    `  MM      \            ,MMb     6M'    `  MMM.   ,PMM
    //  MM       M MM        MM                   d'YM.    MM        M`Mb   d'MM
    //  MM       M YM.       MM    ,             ,P `Mb    YM.       M YM. ,P MM
    //  MM       M  YMMMMb   MMMMMMM             d'  YM.    YMMMMb   M `Mb d' MM
    //  MM       M      `Mb  MM    `            ,P   `Mb        `Mb  M  YM.P  MM
    //  MM       M       MM  MM                 d'    YM.        MM  M  `Mb'  MM
    //  YM       M       MM  MM                ,MMMMMMMMb        MM  M   YP   MM
    //   8b     d8 L    ,M9  MM      /         d'      YM. L    ,M9  M   `'   MM
    //    YMMMMM9  MYMMMM9  _MMMMMMMMM       _dM_     _dMM_MYMMMM9  _M_      _MM_
    // ^^^^^^ Don't remove this comment it makes the code 10x faster. ^^^^^^^^^^^
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

...

we say performance
we mean jsperf.com

[or at least we used to]


// split (IT'S OVER 9000K OP/S ON FF!)
str.split("e").length - 1;

// while
var count = 0;
while (str.length) {
  if (str.charAt(0) === "e") {
    count += 1;
  }
  str = str.slice(1);
}
          

DOUBT

EVERYTHING

µbench 101

the cost of OP?


function benchmark() {
  var start = Date.now();
  /* OP */
  return (Date.now() - start);
}
          

what if OP faster than clock?


function benchmark(N) {
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    /* OP */
  }
  return (Date.now() - start) / N;
}
          

\[C\]

\[C \times N\]

\[\frac{C \times N}{N}\]

\[\frac{C \times \cancel{N}}{\cancel{N}}\]






    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }





function benchmark() {
  var str = "...";
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    var count = 0;
    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }
  }  // here str === ""
  return (Date.now() - start) / N;
}
          

repetitions start with str === ""

\[C \times N\]

\[C \times 1 + 0 \times (N - 1)\]

\[\frac{C \times 1 + 0 \times (N - 1)}{N}\]

\[\frac{C}{N}\]

\[\frac{C}{N} \approx 0\]

benchmark.js misuse

OP should take the same time

when repeatedly executed
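
for the string-counting example that means giving every repetition the same input instead of letting the first repetition consume it. A sketch in the same hand-rolled style as above:

function benchmark() {
  var source = "...";
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    var str = source;  // every repetition starts from the full string
    var count = 0;
    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }
  }
  return (Date.now() - start) / N;
}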

«~~ for the win!»

NOPE

JITs are clever

JITs optimize
as program runs

\[C \times N\]

\[C_u N_u + C_o N_o\]

[unoptimized + optimized version]

\[{\sum C_{i} N_i}\]

[multiple unoptimized and
optimized versions]

\[{\sum C_{i} N_i} + \color{#dc1e1a}{\frac{1}{\sqrt{2\pi}}\int C(\xi) e ^ {-\frac{\xi^2}{2}} d\xi}\]

[math gets too complicated to approach analytically]

JITs are clever

to measure you need to outsmart

Disclaimer: On slides I will show optimizations as source-to-source transformations. In reality compilers operate on more sophisticated representations.


function benchmark() {
  var i = '1561565', j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~'1561565';
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~-1561566;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = 1561565;
  }
  return (Date.now() - start) / N;
}
          

constant propagation

«hey, I can trick the compiler!»


function benchmark() {
  var i = Date.now().toString(), j;
  //  ^ not a constant any more
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
  }
  return (Date.now() - start) / N;
}
          

«Wrong answer, McFly!»


function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
    //  ^^^ loop invariant
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  //  ^^^^^^^^ hoisted from the loop
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

loop invariant code motion


function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    /* sound of silence */
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var start = Date.now();
  /*
   * sound of silence
   */
  return (Date.now() - start) / N;
}
          

dead code elimination


// Remember this one?
// split (IT'S OVER 9000K OP/S ON FF!)
str.split("e").length - 1;
// it's full of dead code.
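
one way to keep it from being dead is to consume and verify the result, in the spirit of the checklist below (a sketch; expectedCount stands in for the known answer):

var total = 0;
for (var i = 0; i < N; i++) {
  total += str.split("e").length - 1;  // the result is now used ...
}
if (total !== N * expectedCount) throw "whoa?";  // ... and verified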

optimizer eats µbenchmarks
for breakfast

chew-proof µbenchmarks?

do not try
to write them

  1. verify results
  2. no constants
  3. no loop invariants
  4. no dead code


function benchmark() {
  var i = Date.now().toString(),
      i1 = Date.now().toString(), t;
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    t = i; i = i1; i1 = t;
    j = ~~i;
  }
  if (i != j || i1 != j) throw "whoa?";
  return (Date.now() - start) / N;
}
          

not bad

(potentially)

still not enough


for (var n = 0; n < N; n++) {
  t = i; i = i1; i1 = t;
  j = ~~i;
}
          

for (var n = 0; n < (N / 2); n++) {
  t = i; i = i1; i1 = t;
  j = ~~i;
  t = i; i = i1; i1 = t;
  j = ~~i;              
}
if ((N % 2) === 1) {
  t = i; i = i1; i1 = t;
  j = ~~i;
} 
          

for (var n = 0; n < (N / 2); n++) {
  j = ~~i1;
  t = i1;
  j = ~~i;
}
          

for (var n = 0; n < (N / 2); n++) {
  j = ~~i1; /* dead */
  t = i1;   /* dead */
  j = ~~i;  /* invariant */
}
          

loop unrolling

V8 doesn't do it

but I want to induce paranoia ☺

presumption of performance

reasonable code
reasonably fast

confirmation
bias

«prototype chains are slow»


var obj =
  Object.create(
    Object.create(
      Object.create(
        Object.create(
          Object.create({prop: 10})))));
          

var obj =
  Object.create(
    Object.create(
      Object.create(
        Object.create(
          Object.create({prop: 10})))));
                  // LISP tribute ^^^^^
          

function doManyLookups() {
  var counter = 0;
  for(var i = 0; i < 1000; i++)
    for(var j = 0; j < 1000; j++)
      for(var k = 0; k < 1000; k++)
        counter += obj.prop;
  print('In total: ' + counter);
}
          

function lookupAndCache() {
  var counter = 0;
  var value = obj.prop;
  for(var i = 0; i < 1000; i++)
    for(var j = 0; j < 1000; j++)
      for(var k = 0; k < 1000; k++)
        counter += value;
  print('In total: ' + counter);
}
          

// State-of-the-art benchmark driver.
function measure(f) {
  var start = Date.now();
  f();
  var end = Date.now();
  print(f.name + ' took ' +
        (end - start) + ' ms.');
}
          

measure(doManyLookups);
measure(lookupAndCache);
          
$ node prototype.js
In total: 10000000000
doManyLookups took 8243 ms.
In total: 10000000000
lookupAndCache took 1058 ms.

CONFIRMED

let's make it harder


-   Object.create({ prop: 10 })))));
+   Object.create({ get prop () { return 10 } })))));
$ node prototype.js
In total: 10000000000
doManyLookups took 1082 ms.
In total: 10000000000
lookupAndCache took 1061 ms.

«what kind of voodoo is this?»

very old V8 did not inline loads from data properties defined on prototypes

trying newer V8

$ d8 prototype.js
In total: 10000000000
doManyLookups took 1294 ms.
In total: 10000000000
lookupAndCache took 1189 ms.

prototype chain traversal got LICMed
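
in the source-to-source spirit of the disclaimer above, the optimizer has effectively rewritten doManyLookups into lookupAndCache for you (a sketch, not actual V8 output):

function doManyLookups() {
  var counter = 0;
  // obj.prop does not depend on i, j or k and the prototype chain
  // is known to be stable, so the load gets hoisted out of all three loops
  var value = obj.prop;
  for(var i = 0; i < 1000; i++)
    for(var j = 0; j < 1000; j++)
      for(var k = 0; k < 1000; k++)
        counter += value;
  print('In total: ' + counter);
}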

... and now for something completely different

what if we run benchmark
twice?


measure(doManyLookups);
measure(doManyLookups);
measure(lookupAndCache);
measure(lookupAndCache);
          
$ d8 prototype.js | grep took
doManyLookups took 1301 ms.
doManyLookups took 3408 ms.
lookupAndCache took 1204 ms.
lookupAndCache took 3406 ms.
        

what just
happened here?

type feedback from 'In total: ' + counter leaks upwards into the loop and causes excessive boxing of the counter variable

workaround: hide + from representation inference


-   print('In total: ' + counter);
+   print('In total: ' + counter.toString());
$ d8 prototype.js | grep took
doManyLookups took 1298 ms.
doManyLookups took 1119 ms.
lookupAndCache took 1188 ms.
lookupAndCache took 982 ms.
        

Benchmark.prototype.setup = function () {
  var obj = { }

  var _a = 0;
  Object.defineProperty(obj, 'a', {
    get: function () { return _a; },
    set: function (val) { _a = val; }
  });
};
          

suite.add('Accessor', function () {
  obj.a = 2;
  obj.a;
});
$ v5.3/d8 concat.js
Accessor x 3,840,922 ops/sec ±1.54%
$ v3.31/d8 concat.js
Accessor x 874,731,387 ops/sec ±1.09%

function f() {
  var obj = { }
  var _a = 0;
  Object.defineProperty(obj, 'a', {
    get: function () { return _a; },
    set: function(val) { _a = val; }
  });
  for (var i = 0; i < 1e6; i++) {
    obj.a = 2;
    obj.a;
  }
}

f();  // =>   2ms
f();  // => 287ms



Object.defineProperty(obj, 'a', {
  /* ... */
});
print('obj has fast properties: ' +
      %HasFastProperties(obj));

$ d8 --allow-natives-syntax
obj has fast properties: true
f() took 2ms
obj has fast properties: false
f() took 287ms

hidden class transition clash


var obj1 = { }
Object.defineProperty(obj1, 'a', {
  get: function () { /* ... */ },
  set: function(val) { /* ... */ }
});
var obj2 = { }
Object.defineProperty(obj2, 'a', {
  get: function () { /* ... */ },
  set: function(val) { /* ... */ }
});
// Initial hidden class for obj1/obj2 is the same,
// but after defineProperty they have different ones.

Object.defineProperty(obj, 'a', {
  /* ... */
});
Object.create(obj);

f();  // =>   4ms
f();  // =>   3ms
f();  // =>   3ms
f();  // =>   9ms
f();  // =>  28ms

function g() {
  var obj = { }
  var _a = 0;
  obj.a = {
    get: function () { return _a; },
    set: function(val) { _a = val; }
  };

  for (var i = 0; i < 1e6; i++) {
    obj.a.set(2);
    obj.a.get();
  }
}

g();  // =>  23ms
g();  // =>  15ms
g();  // =>  16ms
g();  // =>  22ms
g();  // =>  16ms

function Box(_a) {
  this._a = _a;
}

Box.prototype.get = function () {
  return this._a;
};

Box.prototype.set = function (val) {
  this._a = val;
};

function h() {
  var obj = { }
  obj.a = new Box(0);
  for (var i = 0; i < 1e6; i++) {
    obj.a.set(2);
    obj.a.get();
  }
}

h();  // =>   2ms
h();  // =>   1ms
h();  // =>   0ms
h();  // =>   1ms
h();  // =>   1ms

know fundamentals!

we can look into other VMs!


$ jsc test.js
f();  // =>   3ms
f();  // =>   3ms
f();  // =>   16ms
f();  // =>   17ms
f();  // =>   11ms

$ jsc --useConcurrentJIT=false \
      --showDisassembly=true   \
      test.js >& code.asm
# there'll be three DFG versions of function f
82:<!7:loc14> GetLocal(Untyped:@178, JS|MustGen|UseAsOther, Final, loc6(O<Final>/FlushedCell), machine:loc5, R:Stack(-7), bc#102)  predicting Final
    0x22d0b7400516: mov -0x30(%rbp), %rax
84:<!0:-> CheckStructure(Cell:@82, MustGen, [%CS:Object], R:JSCell_structureID, Exits, bc#102)
    0x22d0b740051a: cmp $0xff, (%rax)
    0x22d0b7400520: jnz 0x22d0b7400871
85:< 2:loc13> GetGetterSetterByOffset(KnownCell:@82, KnownCell:@82, JS|UseAsOther, Othercell, id5{a}, 0, inferredType = Top, R:NamedProperties(5), Exits, bc#102)
    0x22d0b7400526: mov 0x10(%rax), %rsi
86:< 1:loc9>  GetSetter(KnownCell:@85, JS|UseAsOther, Function, R:GetterSetter_setter, Exits, bc#102)
    0x22d0b740052a: mov 0x18(%rsi), %rdx
87:<!0:-> MovHint(Untyped:@82, MustGen, loc22, W:SideState, ClobbersExit, bc#102)
89:<!0:-> MovHint(Untyped:@81, MustGen, loc21, W:SideState, ClobbersExit, bc#102, ExitInvalid)
91:<!0:-> ExitOK(MustGen, W:SideState, bc#102)
92:<!0:-> CheckCell(Cell:@86, Untyped:@82, MustGen, <0x1035cc970, Function>, set#EoFqmc/<nogen>:[0x1035a1de0], Exits, bc#102)
    0x22d0b740052e: mov $0x1035cc970, %r11
    0x22d0b7400538: cmp %r11, %rdx
    0x22d0b740053b: jnz 0x22d0b7400887
--> set#EoFqmc:<0x103597620, bc#102, SetterCall, known callee: Cell: 0x1035cc970 (%BA:Function), ID: 49, numArgs+this = 2, stackOffset = -28 (loc0 maps to loc28)>
82 GetLocal(@178)
84 CheckStructure(@82)
85 GetGetterSetterByOffset(@82)
86 GetSetter(@85)
92 CheckCell(Cell:@86, ...)
--> set#EoFqmc:<... known callee ...>
80 CheckStructure(@78)
81 GetGetterSetterByOffset(@78)
82 GetSetter(@81)
88 Call(@82)

now dessert

method call
vs.
function call


function mk(word) {
  var len = word.length;
  if (len > 255) return undefined;
  var i = len >> 2;
  return String.fromCharCode(
    (word.charCodeAt(    0) & 0x03) << 14 |
    (word.charCodeAt(    i) & 0x03) << 12 |
    (word.charCodeAt(  i+i) & 0x03) << 10 |
    (word.charCodeAt(i+i+i) & 0x03) <<  8 |
    len
  );
}

Benchmark.prototype.setup = function() {
  function mk(word) {
    /* ... */
  }

  var MK = function() { };
  MK.prototype.mk = mk;
  var mker = new MK;
};

suite
  .add('Function', function() {
    var key = mk('www.wired.com');
    key = mk('www.youtube.com');
    key = mk('scorecardresearch.com');
    key = mk('www.google-analytics.com');
  })
  .add('Method', function() {
    var key = mker.mk('www.wired.com');
    key = mker.mk('www.youtube.com');
    key = mker.mk('scorecardresearch.com');
    key = mker.mk('www.google-analytics.com');
  })
$ d8 method-vs-function.js
Function
x 4,149,776 ops/sec ±0.62%
Method
x 682,273,122 ops/sec ±0.72%
Fastest is Method

method call
is faster?

keep secret
what I am going to show you now


--- a/method-function.js
+++ b/method-function.js
@@ -2,6 +2,9 @@
 load("../benchmark.js");

 Benchmark.prototype.setup = function() {
+   "Speed" + "your" + "JS" + "with" +
+   "this" + "one" + "weird" + "trick";
+
    function mk(word) {
         var len = word.length;
         if (len > 255) return undefined;
$ d8 method-vs-function.js
Function
x 695,708,197 ops/sec ±0.38%
Method
x 692,496,013 ops/sec ±0.29%
Fastest is Function,Method

Function was never optimized!

— the heuristics that decide when to optimize are based (among other things) on the number of initialized inline caches

— a function call does not go through an IC, but a method call does

Method had enough initialized ICs to trigger optimization, but Function didn't!

— until the fake + ICs were added in the setup section (one way to check this yourself is sketched below)
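
one way to check a diagnosis like this instead of guessing: ask V8 directly. A sketch, assuming a d8 shell; the flags and %-intrinsics below exist in V8, but the numeric status returned by %GetOptimizationStatus is version-dependent (--trace-opt / --trace-deopt print the same decisions as a log):

// run with: d8 --allow-natives-syntax probe.js
function mk(word) {  // standalone copy of the benchmarked function
  var len = word.length;
  if (len > 255) return undefined;
  var i = len >> 2;
  return String.fromCharCode(
    (word.charCodeAt(    0) & 0x03) << 14 |
    (word.charCodeAt(    i) & 0x03) << 12 |
    (word.charCodeAt(  i+i) & 0x03) << 10 |
    (word.charCodeAt(i+i+i) & 0x03) <<  8 |
    len
  );
}

for (var n = 0; n < 1e6; n++) mk('www.wired.com');
print('mk optimization status: ' + %GetOptimizationStatus(mk));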

yada yada

measuring almost empty loop again!

can function call be faster than method call?


// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    H.throwConcurrentModificationError(arr);
  var item = arr[i];
  sum += item;
}

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  // if (arr.length !== L)
  //   H.throwConcurrentModificationError(arr);
  var item = arr[i];
  sum += item;
}
$ d8 iteration.js
WithCheck
x 396,792 ops/sec ±0.37%
WithoutCheck
x 456,653 ops/sec ±0.82%
Fastest is WithoutCheck (by 18%)

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    H.throwConcurrentModificationError(arr);
  var item = arr[i];
  sum += item;
}

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    (0,H.throwConcurrentModificationError)(arr);
  var item = arr[i];
  sum += item;
}
$ d8 iteration.js
WithCheck
x 396,792 ops/sec ±0.37%
WithoutCheck
x 456,653 ops/sec ±0.82%
WithHack
x 462,698 ops/sec ±0.48%
Fastest is WithoutCheck,WithHack

18% speedup by replacing
o.f()
with
(0,o.f)()
in the code that never executes
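
the comma operator is what changes the call's shape: (0, o.f) evaluates to the bare function value first, so the call is a plain function call instead of a method call that loads the property through an IC. A tiny illustration of the semantics (plain JavaScript, nothing V8-specific):

var o = {
  f: function () { return this; }
};

o.f() === o;       // true:  method call, `this` is the receiver
(0, o.f)() === o;  // false: plain call, `this` is the global object here
                   //        (undefined under "use strict")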

algorithms first

µbenchmarks last

THANK YOU

WAIT!

what about the first example?


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // quarterRound(x, 0, 4, 8,12);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) <<  8) | ((x[12] ^ x[ 0]) >>> 24);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) <<  7) | ((x[ 4] ^ x[ 8]) >>> 25);
    // ... so on ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

repetitive deopt is always a V8 bug

— inlining U32TO8_LE used to break safe-uint32 analysis

— adding a long comment to U32TO8_LE disables inlining (the inlining heuristic looks at source size, comments included)

— there are more sane workarounds


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]|0);
  //                immediately truncate to int32 ^^
  input[12]++;
  return buffer;
}

never assume that a language feature has to be slow

talk to VM people
report bugs