Java VM

V8

Dart VM

complexity

ChaCha20


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    quarterRound(x, 0, 4, 8,12);
    quarterRound(x, 1, 5, 9,13);
    quarterRound(x, 2, 6,10,14);
    quarterRound(x, 3, 7,11,15);
    quarterRound(x, 0, 5,10,15);
    quarterRound(x, 1, 6,11,12);
    quarterRound(x, 2, 7, 8,13);
    quarterRound(x, 3, 4, 9,14);
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

19MB/s


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    quarterRound(x, 0, 4, 8,12);
    quarterRound(x, 1, 5, 9,13);
    quarterRound(x, 2, 6,10,14);
    quarterRound(x, 3, 7,11,15);
    quarterRound(x, 0, 5,10,15);
    quarterRound(x, 1, 6,11,12);
    quarterRound(x, 2, 7, 8,13);
    quarterRound(x, 3, 4, 9,14);
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // quarterRound(x, 0, 4, 8,12);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) <<  8) | ((x[12] ^ x[ 0]) >>> 24);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) <<  7) | ((x[ 4] ^ x[ 8]) >>> 25);
    // ... so on ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

2MB/s


function U32TO8_LE(x, i, u) {
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

function U32TO8_LE(x, i, u) {
    //  _  _   __   __  _______  _______    _______  _______  __   __  _  _
    // | || | |  | |  ||       ||       |  |   _   ||       ||  |_|  || || |
    // |_||_| |  | |  ||  _____||    ___|  |  |_|  ||  _____||       ||_||_|
    //        |  |_|  || |_____ |   |___   |       || |_____ |       |
    //        |       ||_____  ||    ___|  |       ||_____  ||       |
    //        |       | _____| ||   |___   |   _   | _____| || ||_|| |
    //        |_______||_______||_______|  |__| |__||_______||_|   |_|
    //
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

19MB/s


function U32TO8_LE(x, i, u) {
    //  _  _   __   __  _______  _______    _______  _______  __   __  _  _
    // | || | |  | |  ||       ||       |  |   _   ||       ||  |_|  || || |
    // |_||_| |  | |  ||  _____||    ___|  |  |_|  ||  _____||       ||_||_|
    //        |  |_|  || |_____ |   |___   |       || |_____ |       |
    //        |       ||_____  ||    ___|  |       ||_____  ||       |
    //        |       | _____| ||   |___   |   _   | _____| || ||_|| |
    //        |_______||_______||_______|  |__| |__||_______||_|   |_|
    //
    // ^^^^^^ Don't remove this comment it makes the code 10x faster. ^^^^^^
    x[i]   = u; u >>>= 8;
    x[i+1] = u; u >>>= 8;
    x[i+2] = u; u >>>= 8;
    x[i+3] = u;
}

...

we say performance
we mean jsperf.com


// split
str.split("e").length - 1;

// while
var count = 0;
while (str.length) {
  if (str.charAt(0) === "e") {
    count += 1;
  }
  str = str.slice(1);
}
          

// split (IT'S OVER 9000K OP/S ON FF!)
str.split("e").length - 1;

// while
var count = 0;
while (str.length) {
  if (str.charAt(0) === "e") {
    count += 1;
  }
  str = str.slice(1);
}
          

DOUBT

EVERYTHING

µbench 101

the cost of OP?


function benchmark() {
  var start = Date.now();
  /* OP */
  return (Date.now() - start);
}
          

what if OP faster than clock?


function benchmark(N) {
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    /* OP */
  }
  return (Date.now() - start) / N;
}
          

\[C\]

\[C \times N\]

\[\frac{C \times N}{N}\]

\[\frac{C \times \cancel{N}}{\cancel{N}}\]






    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }




function benchmark() {
  var str = "...";
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    var count = 0;
    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var str = "...";
  var start = Date.now();
  for (var i = 0; i < N; i++) {
    var count = 0;
    while (str.length) {
      if (str.charAt(0) === "e") {
        count += 1;
      }
      str = str.slice(1);
    }
  }
  return (Date.now() - start) / N;
}
          

repetitions start with str === ""

\[C \times N\]

\[C \times 1 + 0 \times (N - 1)\]

\[\frac{C \times 1 + 0 \times (N - 1)}{N}\]

\[\frac{C}{N}\]

\[\frac{C}{N} \approx 0\]

jsperf misuse

OP should take the same time

when repeatedly executed

«~~ for the win!»

NOPE

JITs are clever

JITs optimize
as program runs

\[C \times N\]

\[C_u N_u + C_o N_o\]

[unoptimized + optimized version]

\[{\sum C_{i} N_i}\]

[multiple unoptimized and
optimized versions]

\[{\sum C_{i} N_i} + \color{#dc1e1a}{\frac{1}{\sqrt{2\pi}}\int C(\xi) e ^ {-\frac{\xi^2}{2}} d\xi}\]

[math gets too complicated to approach analytically]

JITs are clever

to measure you need to outsmart

Disclaimer: On slides I will show optimizations as source-to-source transformations. In reality compilers operate on more sophisticated representations.


function benchmark() {
  var i = '1561565', j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~'1561565';
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~-1561566;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = 1561565;
  }
  return (Date.now() - start) / N;
}
          

constant propagation

«hey, I can trick the compiler!»


function benchmark() {
  var i = Date.now().toString(), j;
  //  ^ not a constant any more
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
  }
  return (Date.now() - start) / N;
}
          

«Wrong answer, McFly!»


function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    j = ~~i;
    //  ^^^ loop invariant
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  //  ^^^^^^^^ hoisted from the loop
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

loop invariant code motion


function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var i = Date.now().toString(), j;
  var start = Date.now();
  var j0 = ~~i;
  for (var n = 0; n < N; n++) {
    j = j0;
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    /* sound of silence */
  }
  return (Date.now() - start) / N;
}
          

function benchmark() {
  var start = Date.now();
  /*
   * sound of silence
   */
  return (Date.now() - start) / N;
}
          

dead code elimination


// Remember this one?
// split (IT'S OVER 9000K OP/S ON FF!)
str.split("e").length - 1;
// it's full of dead code.

optimizer eats µbenchmarks
for breakfast

chew proof µbenchmarks?

do not try
to write them

  1. verify results
  2. no constants
  3. no loop invariants
  4. no dead code


function benchmark() {
  var i = Date.now().toString(),
      i1 = Date.now().toString(), t;
  var j;
  var start = Date.now();
  for (var n = 0; n < N; n++) {
    t = i; i = i1; i1 = t;
    j = ~~i;
  }
  if (i != j || i1 != j) throw "whoa?";
  return (Date.now() - start) / N;
}
          

not bad

(potentially)

still not enough


for (var n = 0; n < N; n++) {
  t = i; i = i1; i1 = t;
  j = ~~i;
}
          

for (var n = 0; n < (N / 2); n++) {
  t = i; i = i1; i1 = t;
  j = ~~i;
  t = i; i = i1; i1 = t;
  j = ~~i;              
}
if ((N % 2) === 1) {
  t = i; i = i1; i1 = t;
  j = ~~i;
} 
          

for (var n = 0; n < (N / 2); n++) {
  j = ~~i1;
  t = i1;
  j = ~~i;
}
          

for (var n = 0; n < (N / 2); n++) {
  j = ~~i1; /* dead */
  t = i1;   /* dead */
  j = ~~i;  /* invariant */
}
          

loop unrolling

V8 doesn't do it

but I want to induce paranoia ☺

µbench ⚔ VM

$ d8 concat.js
Inside x 198,893,290 ops/sec ±1.08%
Outside x 674,248,118 ops/sec ±1.08%
Fastest is Outside

«quick! move all your strings out of functions»

time to look
at the code

we expect both variants to be DCEd

IRHydra2

all samples from this talk
are demos within the tool

why Outside is fast(er)?

the forest hidden behind the trees

Outside is fast
at doing nothing

V8 failed to constant fold

[bug!]

lets fix it!


--- src/hydrogen.cc (revision 21348)
+++ src/hydrogen.cc (working copy)
@@ -9639,6 +9639,14 @@
       return left;
     }

+    if (FLAG_fold_constants &&
+        left->IsConstant() && HConstant::cast(left)->HasStringValue() &&
+        right->IsConstant() && HConstant::cast(right)->HasStringValue()) {
+      return AddUncasted<HStringAdd>(
+          left, right, allocation_mode.GetPretenureMode(),
+          STRING_ADD_CHECK_NONE, allocation_mode.feedback_site());
+    }
+
     // Register the dependent code with the allocation site.
     if (!allocation_mode.feedback_site().is_null()) {
       ASSERT(!graph()->info()->IsStub());
$ d8 concat.js
Inside x 758,530,478 ops/sec ±1.82%
Outside x 674,248,118 ops/sec ±1.14%
Fastest is Inside

both are now doing nothing!

presumption of performance

reasonable code
reasonably fast

confirmation
bias

«prototype chains are slow»


var obj =
  Object.create(
    Object.create(
      Object.create(
        Object.create(
          Object.create({prop: 10})))));
          

var obj =
  Object.create(
    Object.create(
      Object.create(
        Object.create(
          Object.create({prop: 10})))));
                  // LISP tribute ^^^^^
          

function doManyLookups() {
  var counter = 0;
  for(var i = 0; i < 1000; i++)
    for(var j = 0; j < 1000; j++)
      for(var k = 0; k < 1000; k++)
        counter += obj.prop;
  print('In total: ' + counter);
}
          

function lookupAndCache() {
  var counter = 0;
  var value = obj.prop;
  for(var i = 0; i < 1000; i++)
    for(var j = 0; j < 1000; j++)
      for(var k = 0; k < 1000; k++)
        counter += value;
  print('In total: ' + counter);
}
          

// State of art benchmark driver.
function measure(f) {
  var start = Date.now();
  f();
  var end = Date.now();
  print(f.name + ' took ' +
        (end - start) + ' ms.');
}
          

measure(doManyLookups);
measure(lookupAndCache);
          
$ node prototype.js
In total: 10000000000
doManyLookups took 8243 ms.
In total: 10000000000
lookupAndCache took 1058 ms.

CONFIRMED

lets make it harder


-   Object.create({ prop: 10 })))));
+   Object.create({ get prop () { return 10 } })))));
$ node prototype.js
In total: 10000000000
doManyLookups took 1082 ms.
In total: 10000000000
lookupAndCache took 1061 ms.

«what kind of voodoo is this?»

very old V8 did not inline loads from data properties defined on prototypes

trying newer V8

$ d8 prototype.js
In total: 10000000000
doManyLookups took 1294 ms.
In total: 10000000000
lookupAndCache took 1189 ms.
$ d8 prototype.js
In total: 10000000000
doManyLookups took 1294 ms.
In total: 10000000000
lookupAndCache took 1189 ms.

prototype chain traversal got LICMed

... and now for something completely different

what if we run benchmark
twice?


measure(doManyLookups);
measure(doManyLookups);
measure(lookupAndCache);
measure(lookupAndCache);
          
$ d8 prototype.js | grep took
doManyLookups took 1301 ms.
doManyLookups took 3408 ms.
lookupAndCache took 1204 ms.
lookupAndCache took 3406 ms.
        

what just
happened here?

'In total: ' + counter type-feedback leaks upwards into the loop and causes excessive boxing of the counter variable

workaround: hide + from representation inference


-   print('In total: ' + counter);
+   print('In total: ' + counter.toString());
$ d8 prototype.js | grep took
doManyLookups took 1298 ms.
doManyLookups took 1119 ms.
lookupAndCache took 1188 ms.
lookupAndCache took 982 ms.
        

now desert!

method call
vs.
function call


function mk(word) {
  var len = word.length;
  if (len > 255) return undefined;
  var i = len >> 2;
  return String.fromCharCode(
    (word.charCodeAt(    0) & 0x03) << 14 |
    (word.charCodeAt(    i) & 0x03) << 12 |
    (word.charCodeAt(  i+i) & 0x03) << 10 |
    (word.charCodeAt(i+i+i) & 0x03) <<  8 |
    len
  );
}

Benchmark.prototype.setup = function() {
  function mk(word) {
    /* ... */
  }

  var MK = function() { };
  MK.prototype.mk = mk;
  var mker = new MK;
};

suite
  .add('Function', function() {
    var key = mk('www.wired.com');
    key = mk('www.youtube.com');
    key = mk('scorecardresearch.com');
    key = mk('www.google-analytics.com');
  })
  .add('Method', function() {
    var key = mker.mk('www.wired.com');
    key = mker.mk('www.youtube.com');
    key = mker.mk('scorecardresearch.com');
    key = mker.mk('www.google-analytics.com');
  })
$ d8 method-vs-function.js
Function
x 4,149,776 ops/sec ±0.62%
Method
x 682,273,122 ops/sec ±0.72%
Fastest is Method

method call
is faster?

keep secret
what I am going to show you now


--- a/method-function.js
+++ b/method-function.js
@@ -2,6 +2,9 @@
 load("../benchmark.js");

 Benchmark.prototype.setup = function() {
+   "Speed" + "your" + "JS" + "with" +
+   "this" + "one" + "weird" + "trick";
+
    function mk(word) {
         var len = word.length;
         if (len > 255) return undefined;
$ d8 method-vs-function.js
Function
x 695,708,197 ops/sec ±0.38%
Method
x 692,496,013 ops/sec ±0.29%
Fastest is Function,Method

Function was never optimized!

— heuristics that decide when to optimize are based (among other things) on the amount of initialized inline caches

— function call does not go through an IC, but method call does

Method had enough initialized ICs to trigger optimizations, but Function didn't!

— Until fake + ICs were added in the setup section

yada yada

measuring almost empty loop again!

can function call be faster than method call?


// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    H.throwConcurrentModificationError(arr);
  var item = list[i];
  sum += item;
}

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  // if (arr.length !== L)
  //   H.throwConcurrentModificationError(arr);
  var item = list[i];
  sum += item;
}
$ d8 iteration.js
WithCheck
x 396,792 ops/sec ±0.37%
WithoutCheck
x 456,653 ops/sec ±0.82%
Fastest is WithoutCheck (by 18%)

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    H.throwConcurrentModificationError(arr);
  var item = list[i];
  sum += item;
}

// for (var item in list) { ... }
var sum = 0;
for (var i = 0, L = arr.length;
     i < arr.length;
     ++i) {
  if (arr.length !== L)
    (0,H.throwConcurrentModificationError)(arr);
  var item = list[i];
  sum += item;
}
$ d8 iteration.js
WithCheck
x 396,792 ops/sec ±0.37%
WithoutCheck
x 456,653 ops/sec ±0.82%
WithHack
x 462,698 ops/sec ±0.48%
Fastest is WithoutCheck,WithHack

18% speedup by replacing
o.f()
with
(0,o.f)()
in the code that never executes

algorithms first

µbenchmarks last

THANK YOU

WAIT!

what about the first example?


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // quarterRound(x, 0, 4, 8,12);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) <<  8) | ((x[12] ^ x[ 0]) >>> 24);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) <<  7) | ((x[ 4] ^ x[ 8]) >>> 25);
    // ... so on ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}          

repetitive deopt is always V8 bug

— inlining U32TO8_LE used to break safe-uint32 analysis

— adding long comment into U32TO8_LE disables inlining

— there are more sane workarounds


function getBlock(buffer) {
  var x = new Uint32Array(16);

  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // ...
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]|0);
  //                immediately truncate to int32 ^^
  input[12]++;
  return buffer;
}          
never assume that a language feature has to be slow

talk to VM people
report bugs