Skip to content

Commit 38e14bb

Browse files
feat(Experimental): add support for metaformats (microformats#229)
* feat(Experimental): add support for metaformats * implement metaformats parsing Closes microformats#224 * chore(deps): update microformats/test (microformats#1) should fix test ordering issue --------- Co-authored-by: aimee-gm <[email protected]>
1 parent e5b6070 commit 38e14bb

33 files changed

+966
-21
lines changed

.github/workflows/build_and_test.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
- uses: actions/checkout@v2
1818
- uses: actions/setup-node@v1
1919
with:
20-
node-version: 16
20+
node-version: 18
2121
- name: Install dependencies
2222
run: yarn
2323
- name: Lint code

.nvmrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
16
1+
18

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ These are sourced from the element themselves, a parent microformat, the HTML do
109109

110110
When parsing microformats for text content, all the consecutive whitespace is collapsed into a single space. `<br/>` and `<p>` tags are treated as line breaks.
111111

112+
#### `metaformats`
113+
114+
Enables fallback to [metaformats](https://microformats.org/wiki/metaformats) parsing, which infers content from `<meta>` tags when no microformats are found in the document.
115+
112116
## Contributing
113117

114118
See our [contributing guidelines](./CONTRIBUTING.md) for more information.

demo/demo.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ window.parseHtml = () => {
3232
const baseUrl = document.getElementById("base-url").value;
3333
const lang = document.getElementById("lang").checked;
3434
const textContent = document.getElementById("textContent").checked;
35+
const metaformats = document.getElementById("metaformats").checked;
3536

36-
return parse(html, { baseUrl, experimental: { lang, textContent } });
37+
return parse(html, {
38+
baseUrl,
39+
experimental: { lang, textContent, metaformats },
40+
});
3741
};

demo/index.tpl.html

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,16 @@ <h3>Experimental options</h3>
7272
/>
7373
<span>Better text content</span>
7474
</label>
75+
<label>
76+
<input
77+
type="checkbox"
78+
name="metaformats"
79+
id="metaformats"
80+
value="true"
81+
checked
82+
/>
83+
<span>Metaformats parsing</span>
84+
</label>
7585
</p>
7686

7787
<div class="submit">

src/helpers/metaformats.ts

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
import { Document, Element } from "parse5";
2+
3+
import { MicroformatRoot, ParsingOptions } from "../types";
4+
import {
5+
getAttributeIfTag,
6+
getAttributeValue,
7+
hasRelIntersect,
8+
} from "./attributes";
9+
import { isEnabled } from "./experimental";
10+
import { isElement, isTag } from "./nodeMatchers";
11+
12+
/** Special key for title tag in meta collection */
const TITLE_TAG_KEY = "<title>";
/** Special key for the canonical link URL in meta collection */
const CANONICAL_URL_KEY = "<canonical>";
/** Media types whose `og:` / `twitter:` sub-tags are normalized by the collection */
const MEDIA_TYPES = ["image", "video", "audio"];
/** Media sub-tags that carry a URL and are therefore stored under the base media key */
const MEDIA_URL_SUBTYPES = ["src", "url", "secure_url"];

/** A media URL together with the alt text that was declared for it */
interface ComplexMediaMeta {
  value: string;
  alt: string;
}
type MetaTagContent = string | ComplexMediaMeta;

/** Normalized store of meta tag values, keyed by tag name */
interface MetaContentCollection {
  metaContent: Record<string, MetaTagContent[]>;
  set: (key: string, value: string) => void;
  get: (properties: string[]) => MetaTagContent[] | undefined;
}

/**
 * Creates a normalized store for meta tags
 */
const initializeMetaContentCollection = (): MetaContentCollection => {
  /**
   * Collection of all relevant meta tag content
   * Since tag order isn't guaranteed, need to collect all value before applying defaults
   *
   * Keys come straight from the document, so the store has no prototype:
   * a tag named "constructor", "toString" or "__proto__" must behave like
   * any other key instead of resolving to an inherited Object member.
   */
  const metaContent = Object.create(null) as Record<string, MetaTagContent[]>;

  /**
   * Gets the values of the first property found
   * @param properties Array of properties to look for, preferred item first
   * @returns Stored values of the first matching property, or undefined if none is stored
   */
  const get = (properties: string[]): MetaTagContent[] | undefined => {
    for (const key of properties) {
      const values = metaContent[key];
      if (values) {
        return values;
      }
    }
    return undefined;
  };

  /**
   * Stores meta tag values.
   *
   * Includes following normalization rules:
   * - Duplicates are removed from repeated (array) tags
   * - src, url, and secure_url media tags are treated same as base (e.g. og:image:url -> og:image)
   * - Alt text is added as property on last image url
   *
   * @param key Meta tag name, e.g. "og:image:alt"
   * @param value Content of the meta tag
   */
  const set = (key: string, value: string): void => {
    // Split tag name to normalize values like "og:video:url"
    const [domain, type, subtype] = key.split(":");
    let storageKey = key;

    // Media tags specific parsing
    if (
      (domain === "og" || domain === "twitter") &&
      MEDIA_TYPES.includes(type)
    ) {
      const mediaKey = `${domain}:${type}`;

      if (subtype === "alt") {
        const media = metaContent[mediaKey];
        const lastIndex = (media?.length ?? 0) - 1;
        const last = media?.[lastIndex];

        // Only a plain url can receive alt text. If the last entry already
        // has one, this is a duplicate alt tag and the first alt text wins.
        if (media && typeof last === "string") {
          media[lastIndex] = { value: last, alt: value };
        }

        return; // Alt text is never stored under its own key
      }

      if (MEDIA_URL_SUBTYPES.includes(subtype)) {
        // Normalize different url variants to the base media key
        // Duplicates will be cleaned up on insertion
        storageKey = mediaKey;
      }
    }

    const existing = metaContent[storageKey];

    if (!existing) {
      metaContent[storageKey] = [value];
      return;
    }

    const isDuplicate = existing.some(
      (stored) => (typeof stored === "string" ? stored : stored.value) === value
    );

    if (!isDuplicate) {
      existing.push(value);
    } // Else ignore duplicates
  };

  return {
    metaContent,
    set,
    get,
  };
};
116+
117+
/**
 * Collects the values metaformats parsing draws on from the children of `<head>`:
 * `<meta>` content (keyed by `property` and/or `name`), the `<title>` text and
 * the canonical `<link>` href.
 * @param head The document's head element; only its direct children are inspected
 * @returns Collection holding every value found
 */
const collectMetaTags = (head: Element): MetaContentCollection => {
  const metaTags = initializeMetaContentCollection();

  for (const child of head.childNodes) {
    if (!isElement(child)) {
      continue;
    }

    const content = getAttributeIfTag(child, ["meta"], "content");
    if (content) {
      // Tags keys usually use the "name" attribute but open graph uses "property"
      // Consider them separately in case a meta tag uses both
      // e.g. <meta property="og:title" name="author" content="Johnny Complex" >
      const property = getAttributeValue(child, "property");
      if (property) {
        metaTags.set(property, content);
      }

      const name = getAttributeValue(child, "name");
      if (name && name !== property) {
        metaTags.set(name, content);
      }
    } else if (child.tagName === "title") {
      // An empty <title></title> has no child nodes, so the first child
      // must be checked before probing it with the `in` operator.
      const [titleText] = child.childNodes;
      if (titleText && "value" in titleText) {
        metaTags.set(TITLE_TAG_KEY, titleText.value);
      }
    } else if (
      child.tagName === "link" &&
      hasRelIntersect(child, ["canonical"])
    ) {
      const canonicalUrl = getAttributeValue(child, "href");
      if (canonicalUrl) {
        metaTags.set(CANONICAL_URL_KEY, canonicalUrl);
      }
    }
  }
  return metaTags;
};
155+
156+
/**
 * Builds the implied microformat root from collected meta tag values
 * @param metaTags Previously parsed meta tag collection
 * @param options Library parsing options
 * @returns A single-item array holding the implied root, or an empty array
 * when none of the looked-up meta tags produced a property
 */
const combineRoot = (
  metaTags: MetaContentCollection,
  options: ParsingOptions
): MicroformatRoot[] => {
  const root: MicroformatRoot = { properties: {} };

  if (isEnabled(options, "lang") && options.inherited.lang) {
    root.lang = options.inherited.lang;
  }

  // og:type picks the implied root class; anything unrecognized stays h-entry
  const ogType = (metaTags.get(["og:type"]) ?? [])[0];
  let rootClass = "h-entry";
  if (ogType && typeof ogType === "string") {
    if (ogType === "profile") {
      rootClass = "h-card";
    } else if (ogType.includes("music") || ogType.includes("video")) {
      rootClass = "h-cite";
    }
  }
  root.type = [rootClass];

  // Microformat property -> meta tag keys to look up, preferred key first
  const propertySources: [string, string[]][] = [
    ["name", ["og:title", "twitter:title", TITLE_TAG_KEY]],
    ["summary", ["og:description", "twitter:description", "description"]],
    ["featured", ["og:image", "twitter:image"]],
    ["video", ["og:video", "twitter:video"]],
    ["audio", ["og:audio", "twitter:audio"]],
    ["published", ["article:published_time", "date"]],
    ["updated", ["article:modified_time"]],
    ["author", ["article:author", "author"]],
    ["url", ["og:url", CANONICAL_URL_KEY]],
    // Publication properties useful for h-cite
    ["publication", ["og:site_name", "publisher"]],
  ];

  if (rootClass === "h-card") {
    propertySources.push(
      ["given-name", ["profile:first_name"]],
      ["family-name", ["profile:last_name"]]
    );
  }

  for (const [property, keys] of propertySources) {
    // Empty and undefined values are not added
    const values = (metaTags.get(keys) ?? []).filter(Boolean);
    if (values.length) {
      root.properties[property] = values;
    }
  }

  return Object.keys(root.properties).length === 0 ? [] : [root];
};
233+
234+
/**
 * Parses the document's `<head>` into an implied microformat root
 * @param doc Parsed HTML document
 * @param options Library parsing options
 * @returns The items produced by `combineRoot` for the collected head
 * metadata, or an empty array when the document has no `<head>` element
 */
export const parseMetaformats = (
  doc: Document,
  options: ParsingOptions
): MicroformatRoot[] => {
  const html = doc.childNodes.find(isTag("html"));
  const head = html?.childNodes.find(isTag("head"));

  // Guard instead of asserting non-null: a document without html/head
  // yields no items rather than a TypeError inside collectMetaTags.
  if (!head) {
    return [];
  }

  return combineRoot(collectMetaTags(head), options);
};

src/helpers/nodeMatchers.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ const propClassRegex = classRegex("(p|e|u|dt)");
2020
export const isElement = (node: Node): node is Element =>
2121
"tagName" in node && "childNodes" in node;
2222

23+
/**
 * Builds a type guard for elements with a specific tag name.
 * The returned predicate is true only when the node passes `isElement` and
 * its `tagName` strictly equals the given name, so it can be passed directly
 * to `Array.prototype.find` / `filter` to narrow nodes to `Element`.
 * @param tagName Tag name to match (compared with `===`)
 */
export const isTag =
24+
(tagName: string) =>
25+
(node: Node): node is Element =>
26+
isElement(node) && node.tagName === tagName;
27+
2328
export const isTextNode = (node: Node): node is TextNode => "value" in node;
2429

2530
export const isMicroformatV2Root = (node: Element): boolean =>

src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export interface Options {
77
experimental?: {
88
lang?: boolean;
99
textContent?: boolean;
10+
metaformats?: boolean;
1011
};
1112
}
1213

src/microformats/property.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export const parseP = (node: Element, options: ParsingOptions): string =>
3434
getAttributeIfTag(node, ["abbr", "link"], "title") ??
3535
getAttributeIfTag(node, ["data"], "value") ??
3636
getAttributeIfTag(node, ["img", "area"], "alt") ??
37+
getAttributeIfTag(node, ["meta"], "content") ??
3738
textContent(node, options);
3839

3940
export const parseU = (
@@ -49,6 +50,7 @@ export const parseU = (
4950
valueClassPattern(node, options) ??
5051
getAttributeIfTag(node, ["abbr"], "title") ??
5152
getAttributeIfTag(node, ["data", "input"], "value") ??
53+
getAttributeIfTag(node, ["meta"], "content") ??
5254
textContent(node, options);
5355

5456
if (typeof url === "string" && isLocalLink(url)) {
@@ -63,6 +65,7 @@ const parseDt = (node: Element, options: ParsingOptions): string =>
6365
getAttributeIfTag(node, ["time", "ins", "del"], "datetime") ??
6466
getAttributeIfTag(node, ["abbr"], "title") ??
6567
getAttributeIfTag(node, ["data", "input"], "value") ??
68+
getAttributeIfTag(node, ["meta"], "content") ??
6669
textContent(node, options);
6770

6871
export const parseE = (node: Element, options: ParsingOptions): Html => {

src/parser.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import { isMicroformatRoot } from "./helpers/nodeMatchers";
66
import { ParsedDocument, ParserOptions, ParsingOptions } from "./types";
77
import { validateParsedHtml } from "./validator";
88
import { documentSetup } from "./helpers/documentSetup";
9+
import { parseMetaformats } from "./helpers/metaformats";
10+
import { isEnabled } from "./helpers/experimental";
911

1012
export const parser = (
1113
html: string,
@@ -22,12 +24,17 @@ export const parser = (
2224
idRefs,
2325
inherited: { roots: [], lang },
2426
};
27+
let items = findChildren(doc, isMicroformatRoot).map((mf) =>
28+
parseMicroformat(mf, parsingOptions)
29+
);
30+
31+
if (items.length === 0 && isEnabled(parsingOptions, "metaformats")) {
32+
items = parseMetaformats(doc, parsingOptions);
33+
}
2534

2635
return {
2736
rels,
2837
"rel-urls": relUrls,
29-
items: findChildren(doc, isMicroformatRoot).map((mf) =>
30-
parseMicroformat(mf, parsingOptions)
31-
),
38+
items,
3239
};
3340
};

src/rels/rels.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ export const parseRel = (
4343
relUrls[href] = { rels: [rel], text };
4444
} else if (!relUrls[href].rels.includes(rel)) {
4545
relUrls[href].rels.push(rel);
46+
relUrls[href].rels.sort();
4647
}
4748

4849
if (text && !relUrls[href].text) {

0 commit comments

Comments
 (0)