Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revisit handling of images processing and other fixes #2143

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Unreleased:
* FIX: Ensure AWS SDK has access to object size when issuing an upload (@benoit #2117)
* FIX: Change log level of S3 missing keys message (@benoit #2144)
* FIX: Logic to set .webp path prefix on reencoded images is skewed (@benoit74 #2140)
* FIX: S3 cached images are missing (@benoit74 #2136)
* FIX: Do not rely on URL filename extension to detect images (@benoit74 #2088)
* FIX: S3 cached image are never used (@benoit74 #2138)

1.14.0:
* FIX: Remove S3 upload concurrency to avoid 'RequestTimeTooSkewed' errors (@benoi74 #2118)
Expand Down
50 changes: 2 additions & 48 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@
"imagemin-webp": "^7.0.0",
"md5": "^2.3.0",
"merge": "^2.1.1",
"mime-type": "^4.0.0",
"mkdirp": "^2.1.6",
"mocha": "^10.2.0",
"p-map": "^5.5.0",
Expand Down
147 changes: 81 additions & 66 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
import sharp from 'sharp'
import http from 'http'
import https from 'https'
import { fileTypeFromBuffer } from 'file-type'

import { normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBitmapImageMimeType, isImageUrl, getMimeType, isWebpCandidateImageMimeType } from './util/index.js'
import { normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBitmapImageMimeType, isWebpCandidateImageMimeType } from './util/index.js'
import S3 from './S3.js'
import * as logger from './Logger.js'
import MediaWiki, { QueryOpts } from './MediaWiki.js'
Expand Down Expand Up @@ -68,6 +69,10 @@
backoffHandler: (number: number, delay: number, error?: any) => void
}

interface CompressionData {
data: any
}

export const defaultStreamRequestOptions: AxiosRequestConfig = {
headers: {
accept: 'application/octet-stream',
Expand Down Expand Up @@ -353,7 +358,7 @@
const url = urlHelper.deserializeUrl(_url)
await this.claimRequest()
return new Promise<T>((resolve, reject) => {
this.backoffCall(this.getJSONCb, url, (err: any, val: any) => {
this.backoffCall(this.getJSONCb, url, 'json', (err: any, val: any) => {
this.releaseRequest()
if (err) {
const httpStatus = err.response && err.response.status
Expand All @@ -366,7 +371,7 @@
})
}

public async downloadContent(_url: string, retry = true): Promise<{ content: Buffer | string; responseHeaders: any }> {
public async downloadContent(_url: string, kind: string, retry = true): Promise<{ content: Buffer | string; contentType: string; setCookie: string | null }> {
if (!_url) {
throw new Error(`Parameter [${_url}] is not a valid url`)
}
Expand All @@ -384,9 +389,9 @@
}
}
if (retry) {
this.backoffCall(this.getContentCb, url, cb)
this.backoffCall(this.getContentCb, url, kind, cb)
} else {
this.getContentCb(url, cb)
this.getContentCb(url, kind, cb)
}
})
} catch (err) {
Expand Down Expand Up @@ -454,7 +459,7 @@
return null
}

private getJSONCb = <T>(url: string, handler: (...args: any[]) => any): void => {
private getJSONCb = <T>(url: string, kind: string, handler: (...args: any[]) => any): void => {
logger.info(`Getting JSON from [${url}]`)
axios
.get<T>(url, this.jsonRequestOptions)
Expand All @@ -466,7 +471,7 @@
const newMaxActiveRequests: number = Math.max(this.maxActiveRequests - 1, 1)
logger.log(`Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`)
this.maxActiveRequests = newMaxActiveRequests
return this.getJSONCb(url, handler)
return this.getJSONCb(url, kind, handler)

Check warning on line 474 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L474

Added line #L474 was not covered by tests
} else if (err.response && err.response.status === 404) {
handler(err)
}
Expand All @@ -477,56 +482,68 @@
})
}

private async getCompressedBody(resp: any): Promise<any> {
if (isBitmapImageMimeType(resp.headers['content-type'])) {
if (isWebpCandidateImageMimeType(this.webp, resp.headers['content-type']) && !this.cssDependenceUrls.hasOwnProperty(resp.config.url)) {
resp.data = await (imagemin as any)
.buffer(resp.data, imageminOptions.get('webp').get(resp.headers['content-type']))
.catch(async (err) => {
if (/Unsupported color conversion request/.test(err.stderr)) {
return (imagemin as any)
.buffer(await sharp(resp.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(resp.headers['content-type']))
.catch(() => {
return resp.data
})
.then((data) => {
resp.headers['content-type'] = 'image/webp'
return data
private async getImageMimeType(data: any): Promise<string | null> {
const fileType = await fileTypeFromBuffer(data)
return fileType ? fileType.mime : null
}

private async getCompressedBody(input: CompressionData): Promise<CompressionData> {
const contentType = await this.getImageMimeType(input.data)
if (isBitmapImageMimeType(contentType)) {
if (this.webp && isWebpCandidateImageMimeType(contentType)) {
return {

Check warning on line 494 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L494

Added line #L494 was not covered by tests
data: await (imagemin as any)
.buffer(input.data, imageminOptions.get('webp').get(contentType))
.catch(async (err) => {

Check warning on line 497 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L497

Added line #L497 was not covered by tests
if (/Unsupported color conversion request/.test(err.stderr)) {
return (imagemin as any)

Check warning on line 499 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L499

Added line #L499 was not covered by tests
.buffer(await sharp(input.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(contentType))
.catch(() => {
return input.data

Check warning on line 502 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L501-L502

Added lines #L501 - L502 were not covered by tests
})
.then((data) => {
return data

Check warning on line 505 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L504-L505

Added lines #L504 - L505 were not covered by tests
})
} else {
return (imagemin as any).buffer(input.data, imageminOptions.get('default').get(contentType)).catch(() => {
return input.data

Check warning on line 509 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L507-L509

Added lines #L507 - L509 were not covered by tests
})
} else {
return (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
return resp.data
})
}
})
.then((data) => {
resp.headers['content-type'] = 'image/webp'
return data
})
resp.headers.path_postfix = '.webp'
}
})
.then((data) => {
return data

Check warning on line 514 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L513-L514

Added lines #L513 - L514 were not covered by tests
}),
}
} else {
resp.data = await (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
return resp.data
})
return {
data: await (imagemin as any).buffer(input.data, imageminOptions.get('default').get(contentType)).catch(() => {
return input.data

Check warning on line 520 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L519-L520

Added lines #L519 - L520 were not covered by tests
}),
}
}
return true
}
return false
return {
data: input.data,
}
}

private getContentCb = async (url: string, handler: any): Promise<void> => {
private getContentCb = async (url: string, kind: string, handler: any): Promise<void> => {
logger.info(`Downloading [${url}]`)
try {
if (this.optimisationCacheUrl && isImageUrl(url)) {
if (this.optimisationCacheUrl && kind === 'image') {
this.downloadImage(url, handler)
} else {
// Use the base domain of the wiki being scraped as the Referer header, so that we can
// successfully scrap WMF map tiles.
const resp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { Referer: MediaWiki.baseUrl.href } })
await this.getCompressedBody(resp)
const resp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { ...this.arrayBufferRequestOptions.headers, Referer: MediaWiki.baseUrl.href } })
// If content is an image, we might benefit from compressing it
const content = kind === 'image' ? (await this.getCompressedBody({ data: resp.data })).data : resp.data
// compute content-type from content, since getCompressedBody might have modified it
const contentType = kind === 'image' ? (await this.getImageMimeType(content)) || resp.headers['content-type'] : resp.headers['content-type']
handler(null, {
responseHeaders: resp.headers,
content: resp.data,
contentType,
content,
setCookie: resp.headers['set-cookie'] ? resp.headers['set-cookie'].join(';') : null,
})
}
} catch (err) {
Expand Down Expand Up @@ -555,46 +572,44 @@
}
// Use the base domain of the wiki being scraped as the Referer header, so that we can
// successfully scrap WMF map tiles.
const mwResp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { Referer: MediaWiki.baseUrl.href } })

// HTTP response content-type can not really be trusted (at least if 304)
mwResp.headers['content-type'] = getMimeType(url, s3Resp?.Metadata?.contenttype || mwResp.headers['content-type'])
const mwResp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { ...this.arrayBufferRequestOptions.headers, Referer: MediaWiki.baseUrl.href } })

// Most of the images, after having been uploaded once to the
// cache, will always have 304 status, until modified. If cache
// is up to date, return cached image.
if (mwResp.status === 304) {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const headers = (({ Body, ...o }) => o)(s3Resp)

// If image is a webp conversion candidate
if (isWebpCandidateImageMimeType(this.webp, mwResp.headers['content-type']) && !this.cssDependenceUrls.hasOwnProperty(mwResp.config.url)) {
headers.path_postfix = '.webp'
headers['content-type'] = 'image/webp'
}

// Proceed with image
const data = (await this.streamToBuffer(s3Resp.Body as Readable)) as any
const contentType = await this.getImageMimeType(data)
logger.info(`Using S3-cached image for ${url} (contentType: ${contentType})`)
handler(null, {
responseHeaders: headers,
content: (await this.streamToBuffer(s3Resp.Body as Readable)) as any,
contentType,
content: data,
})

return
}

// Compress content because image blob comes from upstream MediaWiki
await this.getCompressedBody(mwResp)
const compressedData = (await this.getCompressedBody({ data: mwResp.data })).data

Check warning on line 593 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L593

Added line #L593 was not covered by tests

// Check for the ETag and upload to cache
const etag = this.removeEtagWeakPrefix(mwResp.headers.etag)
if (etag) {
await this.s3.uploadBlob(stripHttpFromUrl(url), mwResp.data, etag, mwResp.headers['content-type'], this.webp ? 'webp' : '1')
await this.s3.uploadBlob(stripHttpFromUrl(url), compressedData, etag, this.webp ? 'webp' : '1')
}

// get contentType from image, with fallback to response headers should the image be unsupported at all (e.g. SVG)
const contentType = (await this.getImageMimeType(compressedData)) || mwResp.headers['content-type']
if (s3Resp) {
logger.info(`Using image downloaded from upstream for ${url} (S3-cached image is outdated, contentType: ${contentType})`)
} else {
logger.info(`Using image downloaded from upstream for ${url} (no S3-cached image found, contentType: ${contentType})`)

Check warning on line 606 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L604-L606

Added lines #L604 - L606 were not covered by tests
}

// Proceed with image
handler(null, {
responseHeaders: mwResp.headers,
content: mwResp.data,
contentType,
content: compressedData,
})
})
.catch((err) => {
Expand Down Expand Up @@ -630,8 +645,8 @@
}
}

private backoffCall(handler: (...args: any[]) => void, url: string, callback: (...args: any[]) => void | Promise<void>): void {
const call = backoff.call(handler, url, callback)
private backoffCall(handler: (...args: any[]) => void, url: string, kind: string, callback: (...args: any[]) => void | Promise<void>): void {
const call = backoff.call(handler, url, kind, callback)
call.setStrategy(this.backoffOptions.strategy)
call.retryIf(this.backoffOptions.retryIf)
call.failAfter(this.backoffOptions.failAfter)
Expand Down
2 changes: 1 addition & 1 deletion src/Dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ export class Dump {
const sheetUrls: Array<string | DominoElement> = []

/* Load main page to see which CSS files are needed */
const { content } = await downloader.downloadContent(this.mwMetaData.webUrl)
const { content } = await downloader.downloadContent(this.mwMetaData.webUrl, 'data')
const html = content.toString()
const doc = domino.createDocument(html)
const links = Array.from(doc.getElementsByTagName('link'))
Expand Down
Loading
Loading