From d25c882fda9301c1d3bce78622dca0f9e8706b9a Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 27 Dec 2019 17:19:25 -0800 Subject: [PATCH 01/18] Load JSON-LD from HTML documents. * Adds options parameter to documentLoader * Uses xmldom, if loaded. * Adds util.ParseContentTypeHeader * Adds documentLoader implementations for xhr and node (still requires tests). --- lib/ContextResolver.js | 2 +- lib/documentLoaders/node.js | 33 +++++++--- lib/documentLoaders/xhr.js | 26 ++++++-- lib/jsonld.js | 91 +++++++++++++++++++++++++--- lib/util.js | 29 +++++++++ tests/test-common.js | 116 +++++++++++++----------------------- 6 files changed, 202 insertions(+), 95 deletions(-) diff --git a/lib/ContextResolver.js b/lib/ContextResolver.js index e70ba98a..83d97ccf 100644 --- a/lib/ContextResolver.js +++ b/lib/ContextResolver.js @@ -163,7 +163,7 @@ module.exports = class ContextResolver { let remoteDoc; try { - remoteDoc = await documentLoader(url); + remoteDoc = await documentLoader(url, {}); context = remoteDoc.document || null; // parse string context as JSON if(_isString(context)) { diff --git a/lib/documentLoaders/node.js b/lib/documentLoaders/node.js index 88439b3b..717d7e5e 100644 --- a/lib/documentLoaders/node.js +++ b/lib/documentLoaders/node.js @@ -3,7 +3,11 @@ */ 'use strict'; -const {parseLinkHeader, buildHeaders} = require('../util'); +const { + parseLinkHeader, + buildHeaders, + parseContentTypeHeader +} = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); const RequestQueue = require('../RequestQueue'); @@ -38,11 +42,11 @@ module.exports = ({ const http = require('http'); const queue = new RequestQueue(); - return queue.wrapLoader(function(url) { - return loadDocument(url, []); + return queue.wrapLoader(function(url, options) { + return loadDocument(url, options, []); }); - async function loadDocument(url, redirects) { + async function loadDocument(url, options, redirects) { if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) { throw new JsonLdError( 'URL could not be dereferenced; only "http" and "https" URLs are ' + @@ -61,6 +65,12 @@ module.exports = ({ return doc; } + // add any optional requestProfile + if(options.requestProfile) { + headers.Accept = + headers.Accept + ", application/ld+json;profile=${options.requestProfile}"; + } + let result; let alternate = null; try { @@ -78,8 +88,17 @@ module.exports = ({ } const {res, body} = result; + const {contentType, params} = parseContentTypeHeader(res.headers['content-type']); + + doc = { + contextUrl: null, + documentUrl: url, + document: body || null, + contentType: contentType, + profile: params.profile + }; - doc = {contextUrl: null, documentUrl: url, document: body || null}; + // separate profile from content-type // handle error const statusText = http.STATUS_CODES[res.statusCode]; @@ -95,7 +114,7 @@ module.exports = ({ // handle Link Header if(res.headers.link && - res.headers['content-type'] !== 'application/ld+json') { + contentType !== 'application/ld+json') { // only 1 related link header permitted const linkHeaders = parseLinkHeader(res.headers.link); const linkedContext = linkHeaders[LINK_HEADER_CONTEXT]; @@ -144,7 +163,7 @@ module.exports = ({ }); } redirects.push(url); - return loadDocument(res.headers.location, redirects); + return loadDocument(res.headers.location, options, redirects); } // cache for each redirected URL diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index f1c53e74..cb27a9c7 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -3,7 +3,11 @@ */ 'use strict'; -const {parseLinkHeader, buildHeaders} = require('../util'); +const { + parseLinkHeader, + buildHeaders, + parseContentTypeHeader +} = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); const RequestQueue = require('../RequestQueue'); @@ -31,7 +35,7 @@ module.exports = ({ const queue = new RequestQueue(); return queue.wrapLoader(loader); - async function loader(url) { + async function loader(url, options) { if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) { throw new JsonLdError( 'URL could not be dereferenced; only "http" and "https" URLs are ' + @@ -45,6 +49,12 @@ module.exports = ({ 'jsonld.InvalidUrl', {code: 'loading document failed', url}); } + // add any optional requestProfile + if(options.requestProfile) { + headers.Accept = + headers.Accept + ", application/ld+json;profile=${options.requestProfile}"; + } + let req; try { req = await _get(xhr, url, headers); @@ -65,11 +75,19 @@ module.exports = ({ }); } - let doc = {contextUrl: null, documentUrl: url, document: req.response}; + const {contentType, params} = + parseContentTypeHeader(req.getResponseHeader('Content-Type')); + + let doc = { + contextUrl: null, + documentUrl: url, + document: req.response, + contentType: contentType, + profile: params.profile + }; let alternate = null; // handle Link Header (avoid unsafe header warning by existence testing) - const contentType = req.getResponseHeader('Content-Type'); let linkHeader; if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) { linkHeader = req.getResponseHeader('Link'); diff --git a/lib/jsonld.js b/lib/jsonld.js index ebd91bf5..b1fc0976 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -42,6 +42,7 @@ const LRU = require('lru-cache'); const NQuads = require('./NQuads'); const Rdfa = require('./Rdfa'); +const {prependBase: _prependBase} = require ('./url'); const {expand: _expand} = require('./expand'); const {flatten: _flatten} = require('./flatten'); const {fromRDF: _fromRDF} = require('./fromRdf'); @@ -862,6 +863,9 @@ jsonld.documentLoader = async url => { * @param url the URL to fetch. * @param [options] the options to use: * [documentLoader] the document loader to use. + * [extractAllScripts] concatenates all matching script elements.. + * [profile] used when selecting from HTML script elements. + * [requestProfile] one or more profile IRIs to use in the request. * * @return a Promise that resolves to the retrieved remote document. */ @@ -873,7 +877,10 @@ jsonld.get = async function(url, options) { load = jsonld.documentLoader; } - const remoteDoc = await load(url); + // FIXME: unescape frag? + const [reference, frag] = url.split('#', 2); + + const remoteDoc = await load(reference, options); try { if(!remoteDoc.document) { @@ -882,16 +889,68 @@ jsonld.get = async function(url, options) { 'jsonld.NullRemoteDocument'); } if(_isString(remoteDoc.document)) { - remoteDoc.document = JSON.parse(remoteDoc.document); + if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) { + const domParser = new jsonld.domParser(); + const dom = domParser.parseFromString(remoteDoc.document); + + // Use any document base + const baseElem = dom.getElementsByTagName('base'); + if(baseElem.length > 0) { + const href = baseElem[0].getAttribute('href'); + options.base = _prependBase(options.base || reference, href); + } + + const scripts = dom.getElementsByTagName('script'); + remoteDoc.document = []; + + for(let i = 0; i < scripts.length; i++) { + const script = scripts[i]; + // only application/ld+json + if(!script.getAttribute('type').startsWith('application/ld+json')) { + continue; + } + // If url has a fragment identifier, only matching scripts + if(frag && script.getAttribute('id') !== frag) { + continue; + } + try { + remoteDoc.document.push(JSON.parse(script.textContent)); + } catch(e) { + throw new JsonLdError( + 'Illegal script content.', + 'jsonld.InvalidScriptElement', { + code: 'invalid script element', + remoteDoc + }); + } + } + if(frag && remoteDoc.document.length === 0) { + throw new JsonLdError( + 'No script tag found with id=${frag}.', + 'jsonld.InvalidScriptElement', { + code: 'invalid script element', + remoteDoc + }); + } + if(!options.extractAllScripts) { + remoteDoc.document = remoteDoc.document[0]; + } + } else { + remoteDoc.document = JSON.parse(remoteDoc.document); + } } } catch(e) { - throw new JsonLdError( - 'Could not retrieve a JSON-LD document from the URL.', - 'jsonld.LoadDocumentError', { - code: 'loading document failed', - cause: e, - remoteDoc - }); + if(e.name === 'jsonld.InvalidScriptElement') { + throw(e) + } else { + throw new JsonLdError( + 'Could not retrieve a JSON-LD document from the URL.', + 'jsonld.LoadDocumentError', { + code: 'loading document failed', + cause: e, + remoteDoc + }); + } } return remoteDoc; @@ -942,6 +1001,20 @@ jsonld.documentLoaders = {}; jsonld.documentLoaders.node = require('./documentLoaders/node'); jsonld.documentLoaders.xhr = require('./documentLoaders/xhr'); +// Optional DOM parser +try { + jsonld.domParser = require('xmldom').DOMParser; +} catch(e) { + jsonld.domParser = class NoDOMParser { + parseFromString() { + throw new JsonLdError( + 'Could not parse HTML document. ' + + 'HTML parsing not implemented.', 'jsonld.LoadDocumentError', + {code: 'loading document failed'}); + } + }; +} + /** * Assigns the default document loader for external document URLs to a built-in * default. Supported types currently include: 'xhr' and 'node'. diff --git a/lib/util.js b/lib/util.js index 77da8f61..b813f849 100644 --- a/lib/util.js +++ b/lib/util.js @@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/; const REGEX_LINK_HEADER_PARAMS = /(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g; +// FIXME: conditinally support text/html const DEFAULTS = { headers: { accept: 'application/ld+json, application/json' @@ -142,6 +143,34 @@ api.parseLinkHeader = header => { return rval; }; +/** + * Parses a content-type header. The results will be key'd by the value of "rel". + * + * Accept: application/ld+json + * + * Parses as: ["application/ld+json", {}] + * + * Accept: application/ld+json;profile=http://www.w3.org/ns/json-ld#context + * + * Parses as: ["application/ld+json", {profile: "http://www.w3.org/ns/json-ld#context"}] + * + * If there is more than one + * + * @param header the content-type header to parse. + */ +api.parseContentTypeHeader = header => { + const [type, ...rest] = header.split(';'); + const params = {}; + const rval = [type.trim(), params]; + + // assign parameters + for(const paramString of rest) { + const [param, value] = paramString.split('='); + params[param.trim().toLowerCase()] = value.trim(); + } + return rval; +}; + /** * Throws an exception if the given value is not a valid @type value. * diff --git a/tests/test-common.js b/tests/test-common.js index b51c7555..7f94542c 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -35,11 +35,6 @@ const TEST_TYPES = { // NOTE: idRegex format: //MMM-manifest#tNNN$/, idRegex: [ - // html - /html-manifest#tc001$/, - /html-manifest#tc002$/, - /html-manifest#tc003$/, - /html-manifest#tc004$/, ] }, fn: 'compact', @@ -63,33 +58,8 @@ const TEST_TYPES = { /expand-manifest#t0129$/, // html - /html-manifest#te001$/, - /html-manifest#te002$/, - /html-manifest#te003$/, - /html-manifest#te004$/, - /html-manifest#te005$/, - /html-manifest#te006$/, - /html-manifest#te007$/, - /html-manifest#te010$/, - /html-manifest#te011$/, - /html-manifest#te012$/, - /html-manifest#te013$/, - /html-manifest#te014$/, - /html-manifest#te015$/, - /html-manifest#te016$/, - /html-manifest#te017$/, - /html-manifest#te018$/, - /html-manifest#te019$/, - /html-manifest#te020$/, - /html-manifest#te021$/, - /html-manifest#te022$/, - /html-manifest#tex01$/, - // HTML extraction - /expand-manifest#thc01$/, - /expand-manifest#thc02$/, - /expand-manifest#thc03$/, - /expand-manifest#thc04$/, - /expand-manifest#thc05$/, + /html-manifest#tex01$/, // XHTML + /html-manifest#te010$/, // unescaped content // remote /remote-doc-manifest#t0013$/, // HTML ] @@ -111,9 +81,6 @@ const TEST_TYPES = { //MMM-manifest#tNNN$/, idRegex: [ // html - /html-manifest#tf001$/, - /html-manifest#tf002$/, - /html-manifest#tf003$/, /html-manifest#tf004$/, ] }, @@ -189,26 +156,7 @@ const TEST_TYPES = { /toRdf-manifest#twf05$/, // html - /html-manifest#tr001$/, - /html-manifest#tr002$/, - /html-manifest#tr003$/, - /html-manifest#tr004$/, - /html-manifest#tr005$/, - /html-manifest#tr006$/, - /html-manifest#tr007$/, /html-manifest#tr010$/, - /html-manifest#tr011$/, - /html-manifest#tr012$/, - /html-manifest#tr013$/, - /html-manifest#tr014$/, - /html-manifest#tr015$/, - /html-manifest#tr016$/, - /html-manifest#tr017$/, - /html-manifest#tr018$/, - /html-manifest#tr019$/, - /html-manifest#tr020$/, - /html-manifest#tr021$/, - /html-manifest#tr022$/, // Invalid Statement /toRdf-manifest#te075$/, /toRdf-manifest#te111$/, @@ -894,11 +842,11 @@ function createDocumentLoader(test) { 'https://w3c.github.io/json-ld-api/tests', 'https://w3c.github.io/json-ld-framing/tests' ]; - const localLoader = function(url) { + const localLoader = function(url, options) { // always load remote-doc tests remotely in node // NOTE: disabled due to github pages issues. //if(options.nodejs && test.manifest.name === 'Remote document') { - // return jsonld.documentLoader(url); + // return jsonld.documentLoader(url, options); //} // FIXME: this check only works for main test suite and will not work if: @@ -915,25 +863,34 @@ function createDocumentLoader(test) { } // load remotely - return jsonld.documentLoader(url); + return jsonld.documentLoader(url, options); }; return localLoader; function loadLocally(url) { - const doc = {contextUrl: null, documentUrl: url, document: null}; - const options = test.option; + const doc = { + contextUrl: null, + documentUrl: url, + document: null, + contentType: null, + profile: null + }; + const options = test.option || {}; + doc.contentType = options.contentType; + if(!doc.contentType && url.indexOf('.jsonld', url.length - 7) !== -1) { + doc.contentType = 'application/ld+json'; + } + if(!doc.contentType && url.indexOf('.json', url.length - 5) !== -1) { + doc.contentType = 'application/json'; + } + if(!doc.contentType && url.indexOf('.html', url.length - 5) !== -1) { + doc.contentType = 'text/html'; + } if(options && url === test.base) { if('redirectTo' in options && parseInt(options.httpStatus, 10) >= 300) { doc.documentUrl = test.manifest.baseIri + options.redirectTo; } else if('httpLink' in options) { - let contentType = options.contentType || null; - if(!contentType && url.indexOf('.jsonld', url.length - 7) !== -1) { - contentType = 'application/ld+json'; - } - if(!contentType && url.indexOf('.json', url.length - 5) !== -1) { - contentType = 'application/json'; - } let linkHeader = options.httpLink; if(Array.isArray(linkHeader)) { linkHeader = linkHeader.join(','); @@ -941,7 +898,7 @@ function createDocumentLoader(test) { const linkHeaders = jsonld.parseLinkHeader(linkHeader); const linkedContext = linkHeaders['http://www.w3.org/ns/json-ld#context']; - if(linkedContext && contentType !== 'application/ld+json') { + if(linkedContext && doc.contentType !== 'application/ld+json') { if(Array.isArray(linkedContext)) { throw {name: 'multiple context link headers'}; } @@ -951,7 +908,8 @@ function createDocumentLoader(test) { // If not JSON-LD, alternate may point there if(linkHeaders['alternate'] && linkHeaders['alternate'].type == 'application/ld+json' && - !(contentType || '').match(/^application\/(\w*\+)?json$/)) { + !(doc.contentType || '').match(/^application\/(\w*\+)?json$/)) { + doc.contentType = 'application/ld+json'; doc.documentUrl = prependBase(url, linkHeaders['alternate'].target); } } @@ -975,12 +933,22 @@ function createDocumentLoader(test) { }); } - return p.then(readJson).then(json => { - doc.document = json; - return doc; - }).catch(() => { - throw {name: 'loading document failed', url}; - }); + // parse JSON, if appropriate + if(!doc.contentType || doc.contentType.includes('json')) { + return p.then(readJson).then(json => { + doc.document = json; + return doc; + }).catch(() => { + throw {name: 'loading document failed', url}; + }); + } else { + return p.then(readFile).then(content => { + doc.document = content; + return doc; + }).catch(() => { + throw {name: 'loading document failed', url}; + }); + } } } From 7730b10e35fa6d38c146e9adaf037d055dba5bcf Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 27 Dec 2019 17:31:33 -0800 Subject: [PATCH 02/18] Set default for expandAllScripts to true for flatten and toRdf. --- lib/jsonld.js | 4 +++- tests/test-common.js | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/jsonld.js b/lib/jsonld.js index b1fc0976..b99f271f 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -379,6 +379,7 @@ jsonld.flatten = async function(input, ctx, options) { // set default options options = _setDefaults(options, { base: _isString(input) ? input : '', + extractAllScripts: true, contextResolver: new ContextResolver( {sharedCache: _resolvedContextCache}) }); @@ -664,6 +665,7 @@ jsonld.toRDF = async function(input, options) { // set default options options = _setDefaults(options, { base: _isString(input) ? input : '', + extractAllScripts: true, skipExpansion: false, contextResolver: new ContextResolver( {sharedCache: _resolvedContextCache}) @@ -932,7 +934,7 @@ jsonld.get = async function(url, options) { remoteDoc }); } - if(!options.extractAllScripts) { + if(frag || !options.extractAllScripts) { remoteDoc.document = remoteDoc.document[0]; } } else { diff --git a/tests/test-common.js b/tests/test-common.js index 7f94542c..55c48b24 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -80,8 +80,6 @@ const TEST_TYPES = { // NOTE: idRegex format: //MMM-manifest#tNNN$/, idRegex: [ - // html - /html-manifest#tf004$/, ] }, fn: 'flatten', @@ -157,6 +155,7 @@ const TEST_TYPES = { // html /html-manifest#tr010$/, + /html-manifest#tr010$/, // unescaped content // Invalid Statement /toRdf-manifest#te075$/, /toRdf-manifest#te111$/, From 59da13febed23eb0f24b5b72a25e2951a4f05fa1 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 27 Dec 2019 17:34:36 -0800 Subject: [PATCH 03/18] Fix lint errors. --- lib/jsonld.js | 4 ++-- lib/util.js | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/jsonld.js b/lib/jsonld.js index b99f271f..e0a04583 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -42,7 +42,7 @@ const LRU = require('lru-cache'); const NQuads = require('./NQuads'); const Rdfa = require('./Rdfa'); -const {prependBase: _prependBase} = require ('./url'); +const {prependBase: _prependBase} = require('./url'); const {expand: _expand} = require('./expand'); const {flatten: _flatten} = require('./flatten'); const {fromRDF: _fromRDF} = require('./fromRdf'); @@ -943,7 +943,7 @@ jsonld.get = async function(url, options) { } } catch(e) { if(e.name === 'jsonld.InvalidScriptElement') { - throw(e) + throw (e); } else { throw new JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', diff --git a/lib/util.js b/lib/util.js index b813f849..b138abb8 100644 --- a/lib/util.js +++ b/lib/util.js @@ -144,7 +144,8 @@ api.parseLinkHeader = header => { }; /** - * Parses a content-type header. The results will be key'd by the value of "rel". + * Parses a content-type header. + * The results will be key'd by the value of "rel". * * Accept: application/ld+json * @@ -152,9 +153,10 @@ api.parseLinkHeader = header => { * * Accept: application/ld+json;profile=http://www.w3.org/ns/json-ld#context * - * Parses as: ["application/ld+json", {profile: "http://www.w3.org/ns/json-ld#context"}] + * Parses as: ["application/ld+json", + * {profile: "http://www.w3.org/ns/json-ld#context"}] * - * If there is more than one + * If there is more than one * * @param header the content-type header to parse. */ From bdc9343dc07faba7548e488098126d995dafbdc4 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sat, 28 Dec 2019 10:25:19 -0800 Subject: [PATCH 04/18] Don't extract all scripts if there is a fragment identifier. --- tests/test-common.js | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-common.js b/tests/test-common.js index 55c48b24..a61a2474 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -154,7 +154,6 @@ const TEST_TYPES = { /toRdf-manifest#twf05$/, // html - /html-manifest#tr010$/, /html-manifest#tr010$/, // unescaped content // Invalid Statement /toRdf-manifest#te075$/, From 7b29cb3fb47c4fd7bfdd076bad11857fc88ead55 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sat, 28 Dec 2019 12:28:22 -0800 Subject: [PATCH 05/18] Update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5039de4c..d5cfb193 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,13 @@ ## 2.0.1 - 2019-12-10 +### Added +- Support for extracting JSON-LD from HTML, when the xmldom package is loaded. + +### Changed +- Update calls to documentLoader to pass options +- Pass requestProfile in Accept header when loading documents + ### Fixed - JSON literal value handling issues. From a41f081d755ae804c8c371957516e78383c5c43c Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 31 Dec 2019 11:14:38 -0800 Subject: [PATCH 06/18] Add contentType check to XHR documentLoader link header test. --- lib/documentLoaders/xhr.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index cb27a9c7..116e8aff 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -89,7 +89,8 @@ module.exports = ({ // handle Link Header (avoid unsafe header warning by existence testing) let linkHeader; - if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) { + if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders()) && + contentType !== 'application/ld+json') { linkHeader = req.getResponseHeader('Link'); } if(linkHeader && contentType !== 'application/ld+json') { From 03c4834467f335ebdedb92b469bdb26e4fa678b2 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sun, 5 Jan 2020 13:41:05 -0800 Subject: [PATCH 07/18] Test for DOMParser slightly more complicated. Skip HTML tests if there is no DOMParser, or loading the module raises an exception. Allows Karma tests to pass. --- lib/jsonld.js | 9 ++++++++- tests/test-common.js | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/lib/jsonld.js b/lib/jsonld.js index e0a04583..3c2e4709 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -1005,7 +1005,14 @@ jsonld.documentLoaders.xhr = require('./documentLoaders/xhr'); // Optional DOM parser try { - jsonld.domParser = require('xmldom').DOMParser; + jsonld.domParser = require('xmldom').DOMParser || class NoDOMParser { + parseFromString() { + throw new JsonLdError( + 'Could not parse HTML document. ' + + 'HTML parsing not implemented.', 'jsonld.LoadDocumentError', + {code: 'loading document failed'}); + } + }; } catch(e) { jsonld.domParser = class NoDOMParser { parseFromString() { diff --git a/tests/test-common.js b/tests/test-common.js index a61a2474..39107a32 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -25,6 +25,15 @@ const manifest = options.manifest || { filename: '/' }; +let htmlSupport; +try { + // xmldom may load but not have a DOMParser + htmlSupport = !!require('xmldom').DOMParser; +} catch(e) { + htmlSupport = false; +} +console.log("HTML Support: " + htmlSupport); + const TEST_TYPES = { 'jld:CompactTest': { skip: { @@ -385,6 +394,13 @@ function addTest(manifest, test, tests) { self.skip(); } + // if xmldom not loaded, skip HTML tests + if(isJsonLdType(test, 'jld:HtmlTest') && !htmlSupport) { + console.log('Skipping test due to lack of HTML support:', + {id: test['@id'], name: test.name}); + self.skip(); + } + // skip based on test type if(isJsonLdType(test, SKIP_TESTS)) { if(options.verboseSkip) { From 5fa1b2ab2cbf61a4e26afe5973ea3bd28b48de4c Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 6 Jan 2020 12:12:35 -0800 Subject: [PATCH 08/18] Fix CHANGELOG. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5cfb193..9186d19a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,7 @@ - Top level `@graph` omitted if `omitGraph` is `true`. - Check for invalid values of `@embed`. - Support default values for `@type` when framing. +- Support for extracting JSON-LD from HTML, when the xmldom package is loaded. ## 2.0.2 - 2020-01-17 From 5315d679d69ffca89419cabbd1983286410928b5 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 6 Jan 2020 12:17:25 -0800 Subject: [PATCH 09/18] Apply suggestions from @dlongley code review Co-Authored-By: Dave Longley --- lib/documentLoaders/node.js | 2 +- lib/documentLoaders/xhr.js | 2 +- lib/jsonld.js | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/documentLoaders/node.js b/lib/documentLoaders/node.js index 717d7e5e..057a3a18 100644 --- a/lib/documentLoaders/node.js +++ b/lib/documentLoaders/node.js @@ -68,7 +68,7 @@ module.exports = ({ // add any optional requestProfile if(options.requestProfile) { headers.Accept = - headers.Accept + ", application/ld+json;profile=${options.requestProfile}"; + headers.Accept + `, application/ld+json;profile=${options.requestProfile}`; } let result; diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index 116e8aff..1c631623 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -52,7 +52,7 @@ module.exports = ({ // add any optional requestProfile if(options.requestProfile) { headers.Accept = - headers.Accept + ", application/ld+json;profile=${options.requestProfile}"; + headers.Accept + `, application/ld+json;profile=${options.requestProfile}`; } let req; diff --git a/lib/jsonld.js b/lib/jsonld.js index 3c2e4709..b55f9adf 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -865,7 +865,7 @@ jsonld.documentLoader = async url => { * @param url the URL to fetch. * @param [options] the options to use: * [documentLoader] the document loader to use. - * [extractAllScripts] concatenates all matching script elements.. + * [extractAllScripts] concatenates all matching script elements. * [profile] used when selecting from HTML script elements. * [requestProfile] one or more profile IRIs to use in the request. * @@ -928,7 +928,7 @@ jsonld.get = async function(url, options) { } if(frag && remoteDoc.document.length === 0) { throw new JsonLdError( - 'No script tag found with id=${frag}.', + `No script tag found with id=${frag}.`, 'jsonld.InvalidScriptElement', { code: 'invalid script element', remoteDoc From 31f192de0713de9b89a2c06dee06c00d959d8cda Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 6 Jan 2020 12:22:07 -0800 Subject: [PATCH 10/18] More changes suggested by @dlongley. --- lib/documentLoaders/xhr.js | 4 ++-- lib/jsonld.js | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index 1c631623..391fcefa 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -89,8 +89,8 @@ module.exports = ({ // handle Link Header (avoid unsafe header warning by existence testing) let linkHeader; - if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders()) && - contentType !== 'application/ld+json') { + if(contentType !== 'application/ld+json' && + REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) { linkHeader = req.getResponseHeader('Link'); } if(linkHeader && contentType !== 'application/ld+json') { diff --git a/lib/jsonld.js b/lib/jsonld.js index b55f9adf..e3ed0d80 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -943,16 +943,17 @@ jsonld.get = async function(url, options) { } } catch(e) { if(e.name === 'jsonld.InvalidScriptElement') { + // pass error detected in HTML decode throw (e); - } else { - throw new JsonLdError( - 'Could not retrieve a JSON-LD document from the URL.', - 'jsonld.LoadDocumentError', { - code: 'loading document failed', - cause: e, - remoteDoc - }); } + // otherwise, general loading error + throw new JsonLdError( + 'Could not retrieve a JSON-LD document from the URL.', + 'jsonld.LoadDocumentError', { + code: 'loading document failed', + cause: e, + remoteDoc + }); } return remoteDoc; From 9d5ec196379998cd4c5c038c103016c97cbd4fb6 Mon Sep 17 00:00:00 2001 From: "David I. Lehn" Date: Tue, 28 Jan 2020 20:20:38 -0500 Subject: [PATCH 11/18] Fix changelog. --- CHANGELOG.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9186d19a..1d5febaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,8 @@ - `omitGraph` based on processingMode. - Replaced `removePreserve` with `cleanupPreserve` and `cleanupNulls`. - Remove unused framing `graphStack` code that was removed from the spec. +- Update calls to `documentLoader` to pass options. +- Pass `requestProfile` in `Accept` header when loading documents. ### Added - Support for `"@import"`. @@ -88,13 +90,6 @@ ## 2.0.1 - 2019-12-10 -### Added -- Support for extracting JSON-LD from HTML, when the xmldom package is loaded. - -### Changed -- Update calls to documentLoader to pass options -- Pass requestProfile in Accept header when loading documents - ### Fixed - JSON literal value handling issues. From 2664ac799aef9971ac8efb5c852880cefe2c41f0 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 17 Feb 2020 14:00:43 -0800 Subject: [PATCH 12/18] Use content-type package, instead of purpose-defined parseContentType method. --- lib/documentLoaders/node.js | 11 ++++++----- lib/documentLoaders/xhr.js | 12 ++++++------ lib/jsonld.js | 5 +++++ lib/util.js | 30 ------------------------------ package.json | 1 + 5 files changed, 18 insertions(+), 41 deletions(-) diff --git a/lib/documentLoaders/node.js b/lib/documentLoaders/node.js index 057a3a18..a7547389 100644 --- a/lib/documentLoaders/node.js +++ b/lib/documentLoaders/node.js @@ -3,10 +3,11 @@ */ 'use strict'; +const contentType = require ('content-type'); + const { parseLinkHeader, - buildHeaders, - parseContentTypeHeader + buildHeaders } = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); @@ -88,14 +89,14 @@ module.exports = ({ } const {res, body} = result; - const {contentType, params} = parseContentTypeHeader(res.headers['content-type']); + const {type, parameters} = contentType.parse(res); doc = { contextUrl: null, documentUrl: url, document: body || null, - contentType: contentType, - profile: params.profile + contentType: type, + profile: parameters.profile }; // separate profile from content-type diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index 391fcefa..7c973999 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -3,10 +3,11 @@ */ 'use strict'; +const contentType = require ('content-type'); + const { parseLinkHeader, - buildHeaders, - parseContentTypeHeader + buildHeaders } = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); @@ -75,15 +76,14 @@ module.exports = ({ }); } - const {contentType, params} = - parseContentTypeHeader(req.getResponseHeader('Content-Type')); + const {type, parameters} = contentType.parse(req); let doc = { contextUrl: null, documentUrl: url, document: req.response, - contentType: contentType, - profile: params.profile + contentType: type, + profile: parameters.profile }; let alternate = null; diff --git a/lib/jsonld.js b/lib/jsonld.js index e3ed0d80..8ed2e060 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -34,6 +34,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ const canonize = require('rdf-canonize'); +const contentType = require ('content-type'); const util = require('./util'); const ContextResolver = require('./ContextResolver'); const IdentifierIssuer = util.IdentifierIssuer; @@ -908,6 +909,10 @@ jsonld.get = async function(url, options) { for(let i = 0; i < scripts.length; i++) { const script = scripts[i]; // only application/ld+json + const {type, parameters} = contentType.parse(script.getAttribute('type')); + if(type !== 'application/ld+json') { + continue; + } if(!script.getAttribute('type').startsWith('application/ld+json')) { continue; } diff --git a/lib/util.js b/lib/util.js index b138abb8..c07f7669 100644 --- a/lib/util.js +++ b/lib/util.js @@ -143,36 +143,6 @@ api.parseLinkHeader = header => { return rval; }; -/** - * Parses a content-type header. - * The results will be key'd by the value of "rel". - * - * Accept: application/ld+json - * - * Parses as: ["application/ld+json", {}] - * - * Accept: application/ld+json;profile=http://www.w3.org/ns/json-ld#context - * - * Parses as: ["application/ld+json", - * {profile: "http://www.w3.org/ns/json-ld#context"}] - * - * If there is more than one - * - * @param header the content-type header to parse. - */ -api.parseContentTypeHeader = header => { - const [type, ...rest] = header.split(';'); - const params = {}; - const rval = [type.trim(), params]; - - // assign parameters - for(const paramString of rest) { - const [param, value] = paramString.split('='); - params[param.trim().toLowerCase()] = value.trim(); - } - return rval; -}; - /** * Throws an exception if the given value is not a valid @type value. * diff --git a/package.json b/package.json index adb65fab..0505df57 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,7 @@ ], "dependencies": { "canonicalize": "^1.0.1", + "content-type": "^1.0.4", "lru-cache": "^5.1.1", "object.fromentries": "^2.0.2", "rdf-canonize": "^1.0.2", From 868dae64d2ec1cf8b951cc6571ec2215780e7e3e Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 2 Mar 2020 17:52:14 -0800 Subject: [PATCH 13/18] Update lib/documentLoaders/xhr.js Co-Authored-By: David I. Lehn --- lib/documentLoaders/xhr.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index 7c973999..b4e6223d 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -3,7 +3,7 @@ */ 'use strict'; -const contentType = require ('content-type'); +const contentType = require('content-type'); const { parseLinkHeader, From e38ba63fd2e1910fcc4f7a4014d5e499536df569 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 2 Mar 2020 17:57:06 -0800 Subject: [PATCH 14/18] Update lib/documentLoaders/node.js Co-Authored-By: David I. Lehn --- lib/documentLoaders/node.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/documentLoaders/node.js b/lib/documentLoaders/node.js index a7547389..f6022b6d 100644 --- a/lib/documentLoaders/node.js +++ b/lib/documentLoaders/node.js @@ -3,7 +3,7 @@ */ 'use strict'; -const contentType = require ('content-type'); +const contentType = require('content-type'); const { parseLinkHeader, From 9a9bbdb6783ddb854fd81ff1a00167e9489127dc Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 3 Mar 2020 07:32:19 -0800 Subject: [PATCH 15/18] Update lib/jsonld.js Co-Authored-By: David I. Lehn --- lib/jsonld.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/jsonld.js b/lib/jsonld.js index 8ed2e060..1b17254e 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -34,7 +34,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ const canonize = require('rdf-canonize'); -const contentType = require ('content-type'); +const contentType = require('content-type'); const util = require('./util'); const ContextResolver = require('./ContextResolver'); const IdentifierIssuer = util.IdentifierIssuer; From 7cc3d1c955206b6b85539620b1fa9ab905d2b441 Mon Sep 17 00:00:00 2001 From: "David I. Lehn" Date: Fri, 6 Mar 2020 00:08:18 -0500 Subject: [PATCH 16/18] Fix lint issues. --- lib/frame.js | 7 ++++--- lib/jsonld.js | 2 +- tests/test-common.js | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/frame.js b/lib/frame.js index 0a10f217..27675fb7 100644 --- a/lib/frame.js +++ b/lib/frame.js @@ -231,9 +231,10 @@ api.frame = (state, subjects, frame, parent, property = null) => { // recurse into list if(graphTypes.isList(o)) { - const subframe = (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ? - frame[prop][0]['@list'] : - _createImplicitFrame(flags); + const subframe = + (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ? + frame[prop][0]['@list'] : + _createImplicitFrame(flags); // add empty list const list = {'@list': []}; diff --git a/lib/jsonld.js b/lib/jsonld.js index 1b17254e..87628644 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -909,7 +909,7 @@ jsonld.get = async function(url, options) { for(let i = 0; i < scripts.length; i++) { const script = scripts[i]; // only application/ld+json - const {type, parameters} = contentType.parse(script.getAttribute('type')); + const {type} = contentType.parse(script.getAttribute('type')); if(type !== 'application/ld+json') { continue; } diff --git a/tests/test-common.js b/tests/test-common.js index 39107a32..1296f2ca 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -32,7 +32,7 @@ try { } catch(e) { htmlSupport = false; } -console.log("HTML Support: " + htmlSupport); +console.log('HTML Support: ' + htmlSupport); const TEST_TYPES = { 'jld:CompactTest': { From 66a2092d2d436593516f8066b52b9de05d9ece57 Mon Sep 17 00:00:00 2001 From: "David I. Lehn" Date: Fri, 6 Mar 2020 00:09:40 -0500 Subject: [PATCH 17/18] Update error code. - Updated tests switch from "invalid script element" to "loading document failed". --- lib/jsonld.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/jsonld.js b/lib/jsonld.js index 87628644..8583b90f 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -935,7 +935,7 @@ jsonld.get = async function(url, options) { throw new JsonLdError( `No script tag found with id=${frag}.`, 'jsonld.InvalidScriptElement', { - code: 'invalid script element', + code: 'loading document failed', remoteDoc }); } From 11b68b97b514f980afaf857035c163df352e4621 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Thu, 9 Apr 2020 15:09:59 -0700 Subject: [PATCH 18/18] Error when loading HTML and no script element exists. --- lib/jsonld.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/jsonld.js b/lib/jsonld.js index 8583b90f..1d1a6d32 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -940,6 +940,14 @@ jsonld.get = async function(url, options) { }); } if(frag || !options.extractAllScripts) { + if(!remoteDoc.document[0]) { + throw new JsonLdError( + `No script tag found.`, + 'jsonld.InvalidScriptElement', { + code: 'loading document failed', + remoteDoc + }); + } remoteDoc.document = remoteDoc.document[0]; } } else {