Skip to content

Commit 67a2d03

Browse files
authored
Enable caching of generated JS output (#25929)
Logical extension of what we were already doing for symbol lists. This caches the whole output of the JS compiler and re-uses it when settings and JS inputs are unchanged. This improves link time when the cache is hot (i.e. when the same program has been linked before). Hello world: 520ms -> 350ms Hello world + -sINCLUDE_FULL_LIBRARY: 588ms -> 370ms
1 parent 4227534 commit 67a2d03

File tree

3 files changed

+97
-70
lines changed

3 files changed

+97
-70
lines changed

ChangeLog.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ See docs/process.md for more on how version tagging works.
2020

2121
4.0.22 (in development)
2222
-----------------------
23+
- Emscripten will now cache the JS code that it generates and re-use it when
24+
linking with the same settings at a later date. This should improve link
25+
times generally but should be especially noticeable when linking lots of small
26+
programs such as during autoconf or CMake feature detection. (#25929)
2327
- The minimum version of python required to run emscripten was updated from 3.8
2428
to 3.10. (#25891)
2529

tools/emscripten.py

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
headers, for the libc implementation in JS).
1010
"""
1111

12+
import glob
13+
import hashlib
1214
import json
1315
import logging
1416
import os
@@ -20,9 +22,11 @@
2022

2123
from tools import (
2224
building,
25+
cache,
2326
config,
2427
diagnostics,
2528
extract_metadata,
29+
filelock,
2630
js_manipulation,
2731
shared,
2832
utils,
@@ -173,6 +177,32 @@ def inject_code_hooks(name):
173177
return code
174178

175179

180+
@ToolchainProfiler.profile()
def generate_js_compiler_input_hash(symbols_only=False):
  """Return a SHA-1 hex digest identifying the inputs to the JS compiler.

  We define a cache hit as when all the settings and the contents of all the
  JS input files are identical.

  When `symbols_only` is true the `--pre-js`/`--post-js` file contents are
  left out of the hash, since they don't affect the library symbol list.
  Otherwise they are included, each tagged with its role and hashed in the
  order given, because both the role and the order change the generated
  output.
  """
  # Ignore certain settings that are not relevant to library deps. Here we
  # skip PRE_JS_FILES/POST_JS_FILES which can contain full paths to temporary
  # files. Their contents (rather than their paths) are hashed below.
  skip_settings = {'PRE_JS_FILES', 'POST_JS_FILES'}
  settings_json = json.dumps(settings.external_dict(skip_keys=skip_settings), sort_keys=True, indent=2)

  builtin_files = glob.glob(utils.path_from_root('src/lib') + '/lib*.js')
  # Also, include the js compiler code itself, in case it gets locally modified.
  builtin_files += glob.glob(utils.path_from_root('src/*.mjs'))

  # Only the glob results are sorted (glob order is not deterministic). The
  # user-supplied lists keep their order since that order is significant.
  groups = [
    ('builtin', sorted(builtin_files)),
    ('js-library', list(settings.JS_LIBRARIES)),
  ]
  if not symbols_only:
    groups.append(('pre-js', list(settings.PRE_JS_FILES)))
    groups.append(('post-js', list(settings.POST_JS_FILES)))

  hasher = hashlib.sha1()

  def add_section(tag, text):
    # Prefix each section with its tag and byte length so that section
    # boundaries are unambiguous (a plain join would let content move between
    # adjacent files without changing the hash).
    data = text.encode('utf-8')
    hasher.update(f'{tag}:{len(data)}\n'.encode('utf-8'))
    hasher.update(data)

  add_section('settings', settings_json)
  for tag, files in groups:
    # Record the number of files in each group so an empty group is still
    # distinguishable from a missing one.
    hasher.update(f'{tag}[{len(files)}]\n'.encode('utf-8'))
    for file in files:
      add_section(tag, utils.read_file(file))

  return hasher.hexdigest()
204+
205+
176206
@ToolchainProfiler.profile()
177207
def compile_javascript(symbols_only=False):
178208
stderr_file = os.environ.get('EMCC_STDERR_FILE')
@@ -189,15 +219,8 @@ def compile_javascript(symbols_only=False):
189219
args = ['-']
190220
if symbols_only:
191221
args += ['--symbols-only']
192-
out = shared.run_js_tool(path_from_root('tools/compiler.mjs'),
193-
args, input=settings_json, stdout=subprocess.PIPE, stderr=stderr_file)
194-
if symbols_only:
195-
glue = None
196-
forwarded_data = out
197-
else:
198-
assert '//FORWARDED_DATA:' in out, 'Did not receive forwarded data in pre output - process failed?'
199-
glue, forwarded_data = out.split('//FORWARDED_DATA:')
200-
return glue, forwarded_data
222+
return shared.run_js_tool(path_from_root('tools/compiler.mjs'),
223+
args, input=settings_json, stdout=subprocess.PIPE, stderr=stderr_file)
201224

202225

203226
def set_memory(static_bump):
@@ -275,6 +298,59 @@ def trim_asm_const_body(body):
275298
return body
276299

277300

301+
def get_cached_file(filetype, filename, generator, cache_limit):
  """Return the content of a cached file, generating it on a cache miss.

  This function implements a file cache which lives inside the main
  emscripten cache directory but uses a per-file lock rather than a
  cache-wide lock.

  `filetype` names the cache sub-directory, `filename` is the entry within
  it, and `generator` is a zero-argument callable that produces the content
  when the entry does not exist yet.

  The cache is pruned (by removing the files with the oldest modification
  time) if it grows above `cache_limit` files. Lock files are not counted.
  """
  root = cache.get_path(filetype)
  utils.safe_ensure_dirs(root)

  cache_file = os.path.join(root, filename)

  with filelock.FileLock(cache_file + '.lock'):
    if os.path.exists(cache_file):
      # Cache hit, read the file
      file_content = utils.read_file(cache_file)
    else:
      # Cache miss, generate the content and write the file
      file_content = generator()
      utils.write_file(cache_file, file_content)

  if len([f for f in os.listdir(root) if not f.endswith('.lock')]) > cache_limit:
    # Pruning is serialized by a separate lock for the whole file type.
    with filelock.FileLock(cache.get_path(f'{filetype}.lock')):
      files = []
      for f in os.listdir(root):
        if not f.endswith('.lock'):
          f = os.path.join(root, f)
          files.append((f, os.path.getmtime(f)))
      files.sort(key=lambda x: x[1])
      # Delete all but the newest N files. Use an explicit count rather than
      # `files[:-cache_limit]`, which would select nothing when the limit
      # is zero.
      excess = max(len(files) - cache_limit, 0)
      for f, _ in files[:excess]:
        with filelock.FileLock(f + '.lock'):
          utils.delete_file(f)

  return file_content
337+
338+
339+
@ToolchainProfiler.profile()
def compile_javascript_cached():
  """Run the JS compiler, re-using previously cached output when possible.

  Returns the JS compiler output, either freshly generated by
  `compile_javascript` or read back from the `js_output` file cache.
  """
  # The cache is bypassed when generating struct info since that step is
  # performed while the cache is locked. It is also bypassed in DEBUG mode
  # and when the cache is frozen.
  bypass_cache = DEBUG or settings.BOOTSTRAPPING_STRUCT_INFO or config.FROZEN_CACHE
  if not bypass_cache:
    cache_name = f'{generate_js_compiler_input_hash()}.js'
    # Limit of the overall size of the cache.
    # This code will get test coverage since a full test run of `other` or
    # `core` generates ~1000 unique outputs.
    return get_cached_file('js_output', cache_name, compile_javascript, cache_limit=500)

  return compile_javascript()
352+
353+
278354
def emscript(in_wasm, out_wasm, outfile_js, js_syms, finalize=True, base_metadata=None):
279355
# Overview:
280356
# * Run wasm-emscripten-finalize to extract metadata and modify the binary
@@ -358,7 +434,9 @@ def emscript(in_wasm, out_wasm, outfile_js, js_syms, finalize=True, base_metadat
358434
if metadata.invoke_funcs:
359435
settings.DEFAULT_LIBRARY_FUNCS_TO_INCLUDE += ['$getWasmTableEntry']
360436

361-
glue, forwarded_data = compile_javascript()
437+
out = compile_javascript_cached()
438+
assert '//FORWARDED_DATA:' in out, 'Did not receive forwarded data in pre output - process failed?'
439+
glue, forwarded_data = out.split('//FORWARDED_DATA:', 1)
362440

363441
forwarded_json = json.loads(forwarded_data)
364442

tools/link.py

Lines changed: 5 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
# found in the LICENSE file.
55

66
import base64
7-
import glob
8-
import hashlib
97
import json
108
import logging
119
import os
@@ -25,7 +23,6 @@
2523
emscripten,
2624
extract_metadata,
2725
feature_matrix,
28-
filelock,
2926
js_manipulation,
3027
ports,
3128
shared,
@@ -218,47 +215,9 @@ def generate_js_sym_info():
218215
mode of the js compiler that would generate a list of all possible symbols
219216
that could be checked in.
220217
"""
221-
_, forwarded_data = emscripten.compile_javascript(symbols_only=True)
222-
# When running in symbols_only mode compiler.mjs outputs a flat list of C symbols.
223-
return json.loads(forwarded_data)
224-
225-
226-
def get_cached_file(filetype, filename, generator, cache_limit):
227-
"""This function implements a file cache which lives inside the main
228-
emscripten cache directory but uses a per-file lock rather than a
229-
cache-wide lock.
230-
231-
The cache is pruned (by removing the oldest files) if it grows above
232-
a certain number of files.
233-
"""
234-
root = cache.get_path(filetype)
235-
utils.safe_ensure_dirs(root)
236-
237-
cache_file = os.path.join(root, filename)
238-
239-
with filelock.FileLock(cache_file + '.lock'):
240-
if os.path.exists(cache_file):
241-
# Cache hit, read the file
242-
file_content = read_file(cache_file)
243-
else:
244-
# Cache miss, generate the symbol list and write the file
245-
file_content = generator()
246-
write_file(cache_file, file_content)
247-
248-
if len([f for f in os.listdir(root) if not f.endswith('.lock')]) > cache_limit:
249-
with filelock.FileLock(cache.get_path(f'{filetype}.lock')):
250-
files = []
251-
for f in os.listdir(root):
252-
if not f.endswith('.lock'):
253-
f = os.path.join(root, f)
254-
files.append((f, os.path.getmtime(f)))
255-
files.sort(key=lambda x: x[1])
256-
# Delete all but the newest N files
257-
for f, _ in files[:-cache_limit]:
258-
with filelock.FileLock(f + '.lock'):
259-
delete_file(f)
260-
261-
return file_content
218+
output = emscripten.compile_javascript(symbols_only=True)
219+
# When running in symbols_only mode compiler.mjs outputs symbol metadata as JSON.
220+
return json.loads(output)
262221

263222

264223
@ToolchainProfiler.profile_block('JS symbol generation')
@@ -268,21 +227,7 @@ def get_js_sym_info():
268227
if DEBUG or settings.BOOTSTRAPPING_STRUCT_INFO or config.FROZEN_CACHE:
269228
return generate_js_sym_info()
270229

271-
# We define a cache hit as when the settings and `--js-library` contents are
272-
# identical.
273-
# Ignore certain settings that can are no relevant to library deps. Here we
274-
# skip PRE_JS_FILES/POST_JS_FILES which don't effect the library symbol list
275-
# and can contain full paths to temporary files.
276-
skip_settings = {'PRE_JS_FILES', 'POST_JS_FILES'}
277-
input_files = [json.dumps(settings.external_dict(skip_keys=skip_settings), sort_keys=True, indent=2)]
278-
jslibs = glob.glob(utils.path_from_root('src/lib') + '/lib*.js')
279-
# Also, include the js compiler code itself, in case it gets locally modified.
280-
jslibs += glob.glob(utils.path_from_root('src/*.mjs'))
281-
jslibs = sorted(jslibs) + settings.JS_LIBRARIES
282-
for jslib in jslibs:
283-
input_files.append(read_file(jslib))
284-
content = '\n'.join(input_files)
285-
content_hash = hashlib.sha1(content.encode('utf-8')).hexdigest()
230+
content_hash = emscripten.generate_js_compiler_input_hash(symbols_only=True)
286231

287232
def generate_json():
288233
library_syms = generate_js_sym_info()
@@ -291,7 +236,7 @@ def generate_json():
291236
# Limit of the overall size of the cache.
292237
# This code will get test coverage since a full test run of `other` or `core`
293238
# generates ~1000 unique symbol lists.
294-
file_content = get_cached_file('symbol_lists', f'{content_hash}.json', generate_json, cache_limit=500)
239+
file_content = emscripten.get_cached_file('symbol_lists', f'{content_hash}.json', generate_json, cache_limit=500)
295240
return json.loads(file_content)
296241

297242

0 commit comments

Comments
 (0)