|
| 1 | +import { Document, Element } from "parse5"; |
| 2 | + |
| 3 | +import { MicroformatRoot, ParsingOptions } from "../types"; |
| 4 | +import { |
| 5 | + getAttributeIfTag, |
| 6 | + getAttributeValue, |
| 7 | + hasRelIntersect, |
| 8 | +} from "./attributes"; |
| 9 | +import { isEnabled } from "./experimental"; |
| 10 | +import { isElement, isTag } from "./nodeMatchers"; |
| 11 | + |
/** Synthetic collection key under which the text of the <title> tag is stored */
const TITLE_TAG_KEY = "<title>";
/** Synthetic collection key under which the href of the canonical link is stored */
const CANONICAL_URL_KEY = "<canonical>";
/** Media kinds of "og:" / "twitter:" tags that get sub-tag (url, alt) handling */
const MEDIA_TYPES = ["image", "video", "audio"];
| 16 | + |
/** A media entry that has had alternative text attached to it */
interface ComplexMediaMeta {
  /** Content of the media tag the alt text belongs to */
  value: string;
  /** Content of the matching ":alt" tag */
  alt: string;
}

/** One collected value: either the raw tag content or a media entry with alt text */
type MetaTagContent = string | ComplexMediaMeta;
| 22 | + |
/**
 * Creates a normalized store for meta tags.
 *
 * @returns The backing `metaContent` map together with the `set` and `get`
 * accessors that write to and read from it.
 */
const initializeMetaContentCollection = (): MetaContentCollection => {
  /**
   * Collection of all relevant meta tag content.
   * Since tag order isn't guaranteed, all values need to be collected before
   * applying defaults.
   *
   * Keys are tag names taken from the document, so the map is created without
   * a prototype: on a plain `{}` a tag named "constructor" or "__proto__"
   * would resolve to an inherited member and be mistaken for stored values.
   */
  const metaContent: Record<string, MetaTagContent[]> = Object.create(null);

  /** Media sub-tags whose content is stored under the base media key */
  const mediaUrlSubtypes = ["src", "url", "secure_url"];

  /**
   * Gets the values of the first property found
   * @param properties Array of properties to look for, preferred item first
   * @returns The stored values of the first property that has any, otherwise undefined
   */
  const get = (properties: string[]) => {
    for (const key of properties) {
      const values = metaContent[key];
      if (values) {
        return values;
      }
    }
    return;
  };

  /**
   * Stores meta tag values.
   *
   * Includes following normalization rules:
   * - Duplicates are removed from repeated (array) tags
   * - src, url, and secure_url media tags are treated same as base (e.g. og:image:url -> og:image)
   * - Alt text is added as property on last image url
   *
   * @param key Tag name, e.g. "description" or "og:image:alt"
   * @param value Content of the tag
   */
  const set = (key: string, value: string) => {
    // Split tag name to normalize values like "og:video:url"
    const [domain, type, subtype] = key.split(":");

    // Media tags specific parsing
    if (
      (domain === "og" || domain === "twitter") &&
      MEDIA_TYPES.includes(type)
    ) {
      const baseKey = `${domain}:${type}`;

      if (subtype === "alt") {
        const media = metaContent[baseKey];
        const lastIndex = media ? media.length - 1 : -1;
        const last = media?.[lastIndex];

        // Only a plain string entry can take alt text. An object entry already
        // has one (duplicate alt tag) and is kept as-is; alt text with no
        // preceding media entry is dropped.
        if (media && typeof last === "string") {
          media[lastIndex] = { value: last, alt: value };
        }

        return; // Alt text is never stored under its own key
      }

      if (mediaUrlSubtypes.includes(subtype)) {
        // Store under the base key; duplicates are cleaned up on insertion
        key = baseKey;
      }
    }

    const existing = metaContent[key];
    if (!existing) {
      metaContent[key] = [value];
      return;
    }

    // Entries that gained alt text are compared by their underlying value
    const isDuplicate = existing.some(
      (entry) => (typeof entry === "string" ? entry : entry.value) === value
    );
    if (!isDuplicate) {
      existing.push(value);
    }
  };

  return {
    metaContent,
    set,
    get,
  };
};
| 110 | + |
/** Store of collected meta tag content together with its accessors */
interface MetaContentCollection {
  /** Collected values, keyed by tag name */
  metaContent: Record<string, MetaTagContent[]>;
  /** Records the content of a tag under its name */
  set: (key: string, value: string) => void;
  /** Returns the values of the first listed name that has any, otherwise undefined */
  get: (properties: string[]) => MetaTagContent[] | undefined;
}
| 116 | + |
/**
 * Walks the direct children of the head element and records the values used
 * for metaformats: meta tags that have content, the title text, and the href
 * of a canonical link.
 * @param head Element whose child nodes are inspected
 * @returns The populated meta content collection
 */
const collectMetaTags = (head: Element): MetaContentCollection => {
  const metaTags = initializeMetaContentCollection();

  for (const child of head.childNodes) {
    if (!isElement(child)) {
      continue;
    }

    const content = getAttributeIfTag(child, ["meta"], "content");
    if (content) {
      // Tags keys usually use the "name" attribute but open graph uses "property"
      // Consider them separately in case a meta tag uses both
      // e.g. <meta property="og:title" name="author" content="Johnny Complex" >
      const property = getAttributeValue(child, "property");
      if (property) {
        metaTags.set(property, content);
      }

      const name = getAttributeValue(child, "name");
      if (name && name !== property) {
        metaTags.set(name, content);
      }
    } else if (child.tagName === "title") {
      // An empty <title></title> has no child nodes, so check that the first
      // child exists before probing it for a text value
      const [titleText] = child.childNodes;
      if (titleText && "value" in titleText) {
        metaTags.set(TITLE_TAG_KEY, titleText.value);
      }
    } else if (
      child.tagName === "link" &&
      hasRelIntersect(child, ["canonical"])
    ) {
      const canonicalUrl = getAttributeValue(child, "href");
      if (canonicalUrl) {
        metaTags.set(CANONICAL_URL_KEY, canonicalUrl);
      }
    }
  }

  return metaTags;
};
| 155 | + |
/**
 * Builds a single implied microformat item out of collected meta content.
 * @param metaTags Previously parsed meta tag collection
 * @param options Library parsing options
 * @returns A one-item array, or an empty array when no property could be filled
 */
const combineRoot = (
  metaTags: MetaContentCollection,
  options: ParsingOptions
): MicroformatRoot[] => {
  const item: MicroformatRoot = { properties: {} };

  if (isEnabled(options, "lang") && options.inherited.lang) {
    item.lang = options.inherited.lang;
  }

  // Pick the root class from og:type; anything unrecognized stays h-entry
  const [ogType] = metaTags.get(["og:type"]) ?? [];
  let impliedRootClass = "h-entry";
  if (typeof ogType === "string" && ogType) {
    if (ogType === "profile") {
      impliedRootClass = "h-card";
    } else if (ogType.includes("music") || ogType.includes("video")) {
      impliedRootClass = "h-cite";
    }
  }
  item.type = [impliedRootClass];

  // Microformat property -> candidate tag names, preferred name first.
  // Entries are applied in order, which fixes the property order on the item.
  const sources: [string, string[]][] = [
    ["name", ["og:title", "twitter:title", TITLE_TAG_KEY]],
    ["summary", ["og:description", "twitter:description", "description"]],
    ["featured", ["og:image", "twitter:image"]],
    ["video", ["og:video", "twitter:video"]],
    ["audio", ["og:audio", "twitter:audio"]],
    ["published", ["article:published_time", "date"]],
    ["updated", ["article:modified_time"]],
    ["author", ["article:author", "author"]],
    ["url", ["og:url", CANONICAL_URL_KEY]],
    // Publication properties useful for h-cite
    ["publication", ["og:site_name", "publisher"]],
  ];

  if (impliedRootClass === "h-card") {
    sources.push(
      ["given-name", ["profile:first_name"]],
      ["family-name", ["profile:last_name"]]
    );
  }

  for (const [property, candidates] of sources) {
    // Empty and undefined values are not added
    const found = (metaTags.get(candidates) ?? []).filter(Boolean);
    if (found.length) {
      item.properties[property] = found;
    }
  }

  return Object.keys(item.properties).length === 0 ? [] : [item];
};
| 233 | + |
/**
 * Derives implied microformat items from the meta information in a document head.
 * @param doc Parsed document
 * @param options Library parsing options
 * @returns The items produced from the head's meta content, or an empty array
 * when the document has no html/head element
 */
export const parseMetaformats = (
  doc: Document,
  options: ParsingOptions
): MicroformatRoot[] => {
  const html = doc.childNodes.find(isTag("html"));
  const head = html?.childNodes.find(isTag("head"));

  // Validation and manual testing indicate html and head are always present.
  // Guard anyway so a document lacking them yields no items instead of an
  // opaque TypeError from reading childNodes of undefined.
  if (!head) {
    return [];
  }

  return combineRoot(collectMetaTags(head), options);
};
0 commit comments