$ d8 concat.js
Inside x 198,893,290 ops/sec ±1.08%
Outside x 674,248,118 ops/sec ±1.08%
Fastest is Outside

quick! move all your strings out of functions

µbench 101

want to know the cost of OP


// Naive single-shot timer: run OP once and report the wall-clock cost.
// (As the next slide shows, this breaks down when OP is faster than the
// resolution of Date.now().)
function benchmark() {
  var before = Date.now();
  /* OP */
  var after = Date.now();
  return after - before;
}
          

what if OP is faster than resolution of our clock?


// Amortized timer: repeat OP N times and divide the elapsed time by N,
// so a per-iteration cost below the clock's resolution is still measurable.
function benchmark(N) {
  var before = Date.now();
  var iteration = 0;
  while (iteration < N) {
    /* OP */
    iteration++;
  }
  return (Date.now() - before) / N;
}
          

\[C\]

\[C \times N\]

\[\frac{C \times N}{N}\]

\[\frac{C \times \cancel{N}}{\cancel{N}}\]

naive math

vs

smart VMs

\[C \times N\]

\[C_u N_u + C_o N_o\]

\[{\sum C_{i} N_i}\]

\[{\sum C_{i} N_i} + \cdots\]

\[\frac{1}{\sqrt{2\pi}}\int C(\xi) e ^ {-\frac{\xi^2}{2}} d\xi \]

math is hard

lets look at the code

IRHydra2

all samples from this talk are available as demos within the tool

why Outside is fast(er)?

hmm, hard to see the forest for the trees

lets activate an "interesting IR" mode

  • "uninteresting" IR instructions are hidden
  • source is spliced into IR
  • loops are marked with red stripes on the left

By the way: that was core loop of our benchmark and it was EMPTY

only iteration variable increment and loop itself remained

so Outside is fast because it is measuring empty loop

how did this happen?

know basic optimizations

inlining


// Inlining, step 1: the starting point — every loop iteration pays for
// a call to inner(). (for (...) is slide pseudocode, not runnable JS.)
function inner() {
  return 'a' + 'b'
}

for (...) {
  inner();
}
          

// Inlining, step 2: the optimizer notices inner()'s body is tiny and a
// candidate for inlining at the call site.
function inner() {
  return 'a' + 'b'
}

for (...) {
  inner();
}
          

// Inlining, step 3: the call has been replaced by inner()'s body —
// the loop now contains the raw concatenation, ready for folding.
function inner() {
  return 'a' + 'b'
}

for (...) {
  'a' + 'b';
}
          

constant folding


// Before constant folding: both operands of + are compile-time string
// constants, so the result is known without running the loop.
for (...) {
  'a' + 'b';
}
          

// After constant folding: the concatenation was computed once at
// compile time; no string work remains inside the loop.
for (...) {
  'ab';
}
          

loop invariant code motion


// Before LICM: str1 + str2 does not depend on the loop variable, so it
// produces the same value on every iteration.
var str1, str2;
for (...) {
  str1 + str2;
}
          

// After loop-invariant code motion: the concatenation is hoisted out
// and computed exactly once, before the loop.
var str1, str2;
var invS = str1 + str2;
for (...) {
  // use invS here
}
          

dead code elimination


// After LICM the hoisted value exists — but in the benchmark nothing
// ever reads it, which makes it a dead-code-elimination target.
var str1, str2;
var invS = str1 + str2;
// nobody uses invS?
          

// After dead code elimination: the unused concatenation is gone
// entirely — the "benchmark" no longer measures string + at all.
var str1, str2;
// FARVEL!
          

there are more!

but what happened to Inside?

V8 failed to constant fold

actually a bug

lets fix the bug


--- src/hydrogen.cc (revision 21348)
+++ src/hydrogen.cc (working copy)
@@ -9639,6 +9639,14 @@
       return left;
     }

+    if (FLAG_fold_constants &&
+        left->IsConstant() && HConstant::cast(left)->HasStringValue() &&
+        right->IsConstant() && HConstant::cast(right)->HasStringValue()) {
+      return AddUncasted<HStringAdd>(
+          left, right, allocation_mode.GetPretenureMode(),
+          STRING_ADD_CHECK_NONE, allocation_mode.feedback_site());
+    }
+
     // Register the dependent code with the allocation site.
     if (!allocation_mode.feedback_site().is_null()) {
       ASSERT(!graph()->info()->IsStub());
$ d8 concat.js
Inside x 758,530,478 ops/sec ±1.82%
Outside x 674,248,118 ops/sec ±1.14%
Fastest is Inside

reminder: both loops are doing nothing now!

presumption of performance

reasonable code

reasonably fast

confirmation bias

«prototype chains are slow»


// Build a prototype chain five Object.create levels deep, with the
// data property sitting on the object at the very bottom of the chain.
var obj = { prop: 10 };
for (var level = 0; level < 5; level++) {
  obj = Object.create(obj);
}
          

// Five nested Object.create calls: obj's prototype chain is five links
// deep before reaching the object that actually owns `prop`.
var obj =
  Object.create(
    Object.create(
      Object.create(
        Object.create(
          Object.create({prop: 10})))));
                  // LISP tribute ^^^^^
          

// Performs 10^9 lookups of obj.prop (a property living several
// prototype links away) inside triply nested loops. Printing the sum
// keeps the work observable so it cannot be optimized away entirely.
function doManyLookups() {
  var counter = 0;
  for (var a = 0; a < 1000; a++) {
    for (var b = 0; b < 1000; b++) {
      for (var c = 0; c < 1000; c++) {
        counter += obj.prop;
      }
    }
  }
  print('In total: ' + counter);
}
          

// Same workload as doManyLookups, but the prototype-chain lookup is
// hoisted by hand: obj.prop is read once into a local and the 10^9
// iterations add the cached local instead.
function lookupAndCache() {
  var counter = 0;
  var cached = obj.prop;
  var a = 0;
  while (a++ < 1000) {
    var b = 0;
    while (b++ < 1000) {
      var c = 0;
      while (c++ < 1000) {
        counter += cached;
      }
    }
  }
  print('In total: ' + counter);
}
          

// "State of the art" benchmark driver (tongue in cheek): times a single
// call of f with Date.now() and prints the elapsed milliseconds.
function measure(f) {
  var begin = Date.now();
  f();
  var elapsed = Date.now() - begin;
  print(f.name + ' took ' + elapsed + ' ms.');
}
          

// Run each variant once, back to back, in the same VM instance.
measure(doManyLookups);
measure(lookupAndCache);
          
$ node prototype.js
In total: 10000000000
doManyLookups took 8243 ms.
In total: 10000000000
lookupAndCache took 1058 ms.

CONFIRMED

lets make it harder


-   Object.create({prop: 10})))));
+   Object.create({ get prop () { return 10 }})))));
$ node prototype.js
In total: 10000000000
doManyLookups took 1082 ms.
In total: 10000000000
lookupAndCache took 1061 ms.

«what kind of voodoo is this?»

getter was completely inlined

all prototype traversal checks were hoisted

data property is handled in a generic way

through an inline cache

why?

older V8 did not inline loads of data properties defined on prototypes

trying newer V8

$ d8 prototype.js
In total: 10000000000
doManyLookups took 1294 ms.
In total: 10000000000
lookupAndCache took 1189 ms.
$ d8 prototype.js
In total: 10000000000
doManyLookups took 1294 ms.
In total: 10000000000
lookupAndCache took 1189 ms.

prototype chain traversal got LICMed

now something different

what if we run benchmark twice?


// Run each variant twice: the second call executes after the VM has
// already gathered type feedback and (re)optimized the function, which
// is how the surprising slowdown below is provoked.
measure(doManyLookups);
measure(doManyLookups);
measure(lookupAndCache);
measure(lookupAndCache);
          
$ d8 prototype.js | grep took
doManyLookups took 1301 ms.
doManyLookups took 3408 ms.
lookupAndCache took 1204 ms.
lookupAndCache took 3406 ms.
        

what happened here?

turns out: 'In total: ' + counter type-feedback leaks upwards into the loop and causes excessive boxing of the counter variable

solution: hide + from representation inference


-   print('In total: ' + counter);
+   print('In total: ' + counter.toString());
$ d8 prototype.js | grep took
doManyLookups took 1298 ms.
doManyLookups took 1119 ms.
lookupAndCache took 1188 ms.
lookupAndCache took 982 ms.
        

«huh?»

and now for the dessert!

is method call faster than function call?


// Packs a word's length plus the two low bits of four sampled
// characters (indices 0, len/4, len/2, 3*len/4) into one 16-bit
// char code. Words longer than 255 characters are not representable
// in the low byte, so they yield undefined.
function mk(word) {
    var len = word.length;
    if (len > 255) return undefined;
    var step = len >> 2;
    var bits = len;
    bits |= (word.charCodeAt(0) & 0x03) << 14;
    bits |= (word.charCodeAt(step) & 0x03) << 12;
    bits |= (word.charCodeAt(step * 2) & 0x03) << 10;
    bits |= (word.charCodeAt(step * 3) & 0x03) << 8;
    return String.fromCharCode(bits);
}

// Benchmark.js setup hook. NOTE(review): the .add'ed test bodies below
// reference `mk` and `mker` declared here, so Benchmark.js presumably
// splices this setup source into the compiled test scope — confirm
// against benchmark.js before renaming anything declared inside.
Benchmark.prototype.setup = function() {
   function mk(word) {
        /* ... */
    }

    var MK = function() { };
    MK.prototype.mk = mk;
    var mker = new MK;
};

// Two benchmark cases over identical inputs: 'Function' calls mk as a
// free function, 'Method' calls it as a method on mker. Both names come
// from the Benchmark.prototype.setup block. (No trailing .run() is
// shown on this slide.)
suite
  .add('Function', function() {
    var key = mk('www.wired.com');
    key = mk('www.youtube.com');
    key = mk('scorecardresearch.com');
    key = mk('www.google-analytics.com');
  })
  .add('Method', function() {
    var key = mker.mk('www.wired.com');
    key = mker.mk('www.youtube.com');
    key = mker.mk('scorecardresearch.com');
    key = mker.mk('www.google-analytics.com');
  })
$ d8 method-vs-function.js
Function
x 4,149,776 ops/sec ±0.62%
Method
x 682,273,122 ops/sec ±0.72%
Fastest is Method

so method call IS faster than function call?


--- a/method-function.js
+++ b/method-function.js
@@ -2,6 +2,9 @@
 load("../benchmark.js");

 Benchmark.prototype.setup = function() {
+   "Speed" + "your" + "JS" + "with" +
+   "this" + "one" + "weird" + "trick";
+
    function mk(word) {
         var len = word.length;
         if (len > 255) return undefined;
$ d8 method-vs-function.js
Function
x 695,708,197 ops/sec ±0.38%
Method
x 692,496,013 ops/sec ±0.29%
Fastest is Function,Method

Function was never optimized!

heuristics that decide when to optimize are based (among other things) on the amount of initialized inline caches

function call does not go through an IC, but method call does

Method had enough initialized ICs to trigger optimizations, but Function didn't!

Until fake + ICs were added in the setup section

now Function is optimized!

... and its body is almost completely LICMed

SOURCE view is easier

measuring almost empty loop again!

file bugs instead of making conclusions

thank you