Spaces:
Build error
Build error
| import { singleton } from 'tsyringe'; | |
| import _ from 'lodash'; | |
| import { TextItem } from 'pdfjs-dist/types/src/display/api'; | |
| import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; | |
| import { GlobalLogger } from './logger'; | |
| import { PDFContent } from '../db/pdf'; | |
| import dayjs from 'dayjs'; | |
| import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; | |
| import { randomUUID } from 'crypto'; | |
| import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; | |
| import path from 'path'; | |
| import { AsyncLocalContext } from './async-context'; | |
// dayjs plugins are loaded through `require` and registered once at module load.
const utc = require('dayjs/plugin/utc');
const timezone = require('dayjs/plugin/timezone');
dayjs.extend(utc);
dayjs.extend(timezone);

// Start loading the pdf.js legacy build right away; `PDFExtractor.init()` awaits this promise.
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');

// Directory (with trailing slash) of the character maps, resolved relative to the
// installed `pdfjs-dist` entry point and handed to pdf.js as `cMapUrl`.
const nodeCmapUrl = `${path.resolve(require.resolve('pdfjs-dist'), '../../cmaps')}/`;

// Hex-encoded MD5 hasher used to derive cache lookup digests from URLs.
const md5Hasher = new HashManager('md5', 'hex');
/**
 * Population standard deviation of `numbers`: the square root of the mean
 * squared distance from the mean (divides by the sample count, not count - 1).
 *
 * @param numbers Samples to measure.
 * @returns The standard deviation of the samples.
 */
function stdDev(numbers: number[]) {
    const center = _.mean(numbers);
    const squaredDeviations = numbers.map((value) => (value - center) ** 2);

    return Math.sqrt(_.mean(squaredDeviations));
}
/**
 * Tells whether a 2D affine transform `[a, b, c, d, e, f]` rotates its content
 * by 35 degrees or more in either direction.
 *
 * Two angle estimates are derived, one from the `(a, b)` pair and one from the
 * `(c, d)` pair; the transform counts as rotated when either estimate reaches
 * the threshold. The translation components `e` and `f` are ignored.
 *
 * @param transform The six-element transform, or `undefined`.
 * @returns `true` when rotated by at least 35 degrees; `false` otherwise or
 *          when no transform is given.
 */
function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean {
    if (!transform) {
        return false;
    }

    const radiansToDegrees = 180 / Math.PI;
    const [a, b, c, d] = transform;
    const candidateAngles = [Math.atan2(b, a), Math.atan2(-c, d)];

    return candidateAngles.some((radians) => Math.abs(radians * radiansToDegrees) >= 35);
}
/**
 * Extracts text and a markdown rendition from PDF documents using pdf.js,
 * with an optional Firestore / object-storage backed cache.
 */
@singleton()
export class PDFExtractor extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** The pdf.js module; assigned in `init()` once the dynamic import settles. */
    pdfjs!: Awaited<typeof pPdfjs>;

    /** Lifetime written into the `expireAt` field of cache records: 7 days, in milliseconds. */
    cacheRetentionMs = 1000 * 3600 * 24 * 7;

    constructor(
        protected globalLogger: GlobalLogger,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
        protected asyncLocalContext: AsyncLocalContext,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies and for the pdf.js import, then emits `ready`. */
    override async init() {
        await this.dependencyReady();
        this.pdfjs = await pPdfjs;

        this.emit('ready');
    }

    /** Returns `true` when `url` starts with the `data:` scheme. */
    isDataUrl(url: string) {
        return url.startsWith('data:');
    }

    /**
     * Splits a `data:` URL into its media type and payload.
     *
     * The payload is returned exactly as it appears after the first comma; it
     * is not decoded here. Media-type parameters (`;base64`, `;name=...`) are
     * ignored when checking the type.
     *
     * @param url The data URL to parse.
     * @returns The media type (always `application/pdf`) and the raw payload.
     * @throws Error('Invalid data URL') when the scheme is not `data`, the
     *         comma separator is missing, or the payload is empty.
     * @throws Error('Invalid data URL type') when the media type is not
     *         `application/pdf`.
     */
    parseDataUrl(url: string) {
        const schemeEnd = url.indexOf(':');
        const payloadSeparator = url.indexOf(',');
        const protocol = schemeEnd < 0 ? '' : url.slice(0, schemeEnd);

        // The header lives strictly between the scheme colon and the first comma.
        // Locating the comma first keeps the media type correct even when the
        // header carries no `;` parameters at all.
        if (protocol !== 'data' || payloadSeparator < schemeEnd) {
            throw new Error('Invalid data URL');
        }

        const data = url.slice(payloadSeparator + 1);
        if (!data) {
            throw new Error('Invalid data URL');
        }

        const header = url.slice(schemeEnd + 1, payloadSeparator);
        const [mediaType = ''] = header.split(';');
        const contentType = mediaType.trim().toLowerCase();

        if (contentType !== 'application/pdf') {
            throw new Error('Invalid data URL type');
        }

        return {
            type: contentType,
            data: data
        };
    }

    /**
     * Loads a PDF and converts its text layer to markdown.
     *
     * Text items much taller than the document-wide average become headings,
     * items shorter than the page average (or rotated by 35 degrees or more)
     * are collected as "appendix" text and emitted as block quotes, and
     * everything else is paragraph text.
     *
     * @param url A remote URL, or a `data:application/pdf` URL whose payload is
     *            decoded as base64.
     * @returns `meta` (the document info dictionary), `content` (markdown) and
     *          `text` (raw text with line breaks where pdf.js reports them).
     */
    async extract(url: string | URL) {
        let loadingTask: PDFDocumentLoadingTask;

        if (typeof url === 'string' && this.isDataUrl(url)) {
            const { data } = this.parseDataUrl(url);
            const binary = Uint8Array.from(Buffer.from(data, 'base64'));
            loadingTask = this.pdfjs.getDocument({
                data: binary,
                disableFontFace: true,
                verbosity: 0,
                cMapUrl: nodeCmapUrl,
            });
        } else {
            loadingTask = this.pdfjs.getDocument({
                url,
                disableFontFace: true,
                verbosity: 0,
                cMapUrl: nodeCmapUrl,
            });
        }

        try {
            const doc = await loadingTask.promise;
            const meta = await doc.getMetadata();

            const textItems: TextItem[][] = [];

            for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
                const page = await doc.getPage(pageNumber);
                const textContent = await page.getTextContent({ includeMarkedContent: true });
                // NOTE: with `includeMarkedContent` the list can also hold marked-content
                // entries that have no `str`/`height`; the cast hides that from the compiler.
                textItems.push((textContent.items as TextItem[]));
            }

            // Height statistics are weighted per character: each item contributes
            // its height once for every character it holds.
            const articleCharHeights: number[] = [];
            for (const textItem of textItems.flat()) {
                if (textItem.height) {
                    articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
                }
            }
            const articleAvgHeight = _.mean(articleCharHeights);
            const articleStdDevHeight = stdDev(articleCharHeights);

            const mdOps: Array<{
                text: string;
                op?: 'new' | 'append';
                mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
            }> = [];

            const rawChunks: string[] = [];

            let op: 'append' | 'new' = 'new';
            let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
            for (const pageTextItems of textItems) {
                const charHeights = [];
                for (const textItem of pageTextItems as TextItem[]) {
                    if (textItem.height) {
                        charHeights.push(...Array(textItem.str.length).fill(textItem.height));
                    }
                    // Entries without a string `str` would otherwise be rendered as the
                    // literal text "undefined" in the raw output.
                    if (typeof textItem.str === 'string') {
                        rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
                    }
                }

                const avgHeight = _.mean(charHeights);
                const stdDevHeight = stdDev(charHeights);

                for (const textItem of pageTextItems) {
                    // Headings are judged against document-wide statistics, small
                    // print against the statistics of the current page.
                    if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                        mode = 'h1';
                    } else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                        mode = 'h2';
                    } else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                        mode = 'appendix';
                    } else if (textItem.height) {
                        mode = 'p';
                    } else {
                        mode = 'space';
                    }

                    if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
                        mode = 'appendix';
                    }

                    mdOps.push({
                        op,
                        mode,
                        text: textItem.str
                    });

                    // An empty item flagged as end-of-line makes the next item start a new block.
                    if (textItem.hasEOL && !textItem.str) {
                        op = 'new';
                    } else {
                        op = 'append';
                    }
                }
            }

            const mdChunks = [];
            const appendixChunks = [];
            mode = 'space';
            for (const x of mdOps) {
                const previousMode: string = mode;
                const changeToMdChunks = [];

                // A block starts on an explicit 'new' op, or when leaving appendix text.
                const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));

                if (isNewStart) {
                    switch (x.mode) {
                        case 'h1': {
                            changeToMdChunks.push(`\n\n# `);
                            mode = x.mode;
                            break;
                        }
                        case 'h2': {
                            changeToMdChunks.push(`\n\n## `);
                            mode = x.mode;
                            break;
                        }
                        case 'p': {
                            changeToMdChunks.push(`\n\n`);
                            mode = x.mode;
                            break;
                        }
                        case 'appendix': {
                            mode = x.mode;
                            appendixChunks.push(`\n\n`);
                            break;
                        }
                        default: {
                            break;
                        }
                    }
                } else {
                    // Continuing a block: separate from the previous chunk with a space unless
                    // it already ends in whitespace or a hyphen, or is a single character.
                    if (x.mode === 'appendix' && appendixChunks.length) {
                        const lastChunk = appendixChunks[appendixChunks.length - 1];
                        if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                            appendixChunks.push(' ');
                        }
                    } else if (mdChunks.length) {
                        const lastChunk = mdChunks[mdChunks.length - 1];
                        if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                            changeToMdChunks.push(' ');
                        }
                    }
                }

                if (x.text) {
                    if (x.mode === 'appendix') {
                        if (appendixChunks.length || isNewStart) {
                            appendixChunks.push(x.text);
                        } else {
                            changeToMdChunks.push(x.text);
                        }
                    } else {
                        changeToMdChunks.push(x.text);
                    }
                }

                // Leaving appendix text: flush what was collected as a block quote
                // placed ahead of the block that is starting.
                if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
                    const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
                    changeToMdChunks.unshift(appendix);
                    changeToMdChunks.unshift(`\n\n`);
                    appendixChunks.length = 0;
                }

                // A 'space' op contributes at most one chunk.
                if (x.mode === 'space' && changeToMdChunks.length) {
                    changeToMdChunks.length = 1;
                }
                if (changeToMdChunks.length) {
                    mdChunks.push(...changeToMdChunks);
                }
            }

            if (mdChunks.length) {
                mdChunks[0] = mdChunks[0].trimStart();
            }

            return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
        } finally {
            // Release the document and its worker whether or not extraction succeeded;
            // everything returned above is plain data that does not depend on them.
            try {
                await loadingTask.destroy();
            } catch (err) {
                this.logger.warn('Unable to release PDF loading task', { err });
            }
        }
    }

    /**
     * Cache-aware wrapper around `extract()`.
     *
     * Cache records are looked up by the MD5 digest of `alternativeUrl || url`.
     * Data URLs are never looked up or stored. Nothing is stored either when
     * the async context carries a truthy `DNT` flag.
     *
     * @param url The PDF location handed to `extract()`.
     * @param cacheTolerance Maximum age in milliseconds of a usable cache record
     *        (default: one day).
     * @param alternativeUrl Optional URL used instead of `url` for naming and
     *        for the cache key.
     * @returns The extraction result; `undefined` when `url` is empty or when a
     *          fresh cache record exists but its stored content cannot be loaded.
     * @throws AssertionFailureError when the PDF cannot be extracted.
     */
    async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
        if (!url) {
            return undefined;
        }
        let nameUrl = alternativeUrl || url;
        const digest = md5Hasher.hash(nameUrl);

        if (this.isDataUrl(url)) {
            nameUrl = `blob://pdf:${digest}`;
        }

        const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
            (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

        if (cache) {
            const age = Date.now() - cache.createdAt.valueOf();
            const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
                data: url, url: nameUrl, digest, age, stale, cacheTolerance
            });

            if (!stale) {
                // Records may carry the content inline; otherwise it is stored as a JSON blob.
                if (cache.content && cache.text) {
                    return {
                        meta: cache.meta,
                        content: cache.content,
                        text: cache.text
                    };
                }

                try {
                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
                    const cached = JSON.parse(r.toString('utf-8'));

                    return {
                        meta: cached.meta,
                        content: cached.content,
                        text: cached.text
                    };
                } catch (err) {
                    this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });

                    return undefined;
                }
            }
        }

        let extracted;

        try {
            extracted = await this.extract(url);
        } catch (err: any) {
            this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
            throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
        }

        if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
            const theID = randomUUID();

            // Caching is best-effort: a storage failure must not fail a request whose
            // extraction already succeeded. The blob is written first so that a cache
            // record never points at content that was not stored.
            try {
                await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
                    Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
            } catch (err) {
                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err });

                return extracted;
            }

            PDFContent.save(
                PDFContent.from({
                    _id: theID,
                    src: nameUrl,
                    meta: extracted?.meta || {},
                    urlDigest: digest,
                    createdAt: new Date(),
                    expireAt: new Date(Date.now() + this.cacheRetentionMs)
                }).degradeForFireStore()
            ).catch((r) => {
                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
            });
        }

        return extracted;
    }

    /**
     * Parses a PDF date string of the form `D:YYYYMMDDHHmmSSOHH'mm'`.
     *
     * Only the year is mandatory. Missing month and day default to 1, missing
     * time fields to 0. `O` is `Z`, `+` or `-`; the apostrophes around the
     * offset minutes are optional, as is the `D:` prefix. A date without any
     * timezone designator is interpreted in the local timezone.
     *
     * @param pdfDate The date string, typically taken from a PDF info dictionary.
     * @returns The parsed `Date`, or `undefined` when the input is empty or is
     *          not a recognisable PDF date.
     */
    parsePdfDate(pdfDate: string | undefined) {
        if (!pdfDate) {
            return undefined;
        }

        const pdfDatePattern = /^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?([Z+-])?(\d{2})?'?(\d{2})?'?/;
        const matched = pdfDatePattern.exec(pdfDate.trim());
        if (!matched) {
            return undefined;
        }

        const [
            , year, month = '01', day = '01',
            hour = '00', minute = '00', second = '00',
            relation, offsetHour = '00', offsetMinute = '00',
        ] = matched;

        const y = Number(year);
        const mo = Number(month) - 1;
        const d = Number(day);
        const h = Number(hour);
        const mi = Number(minute);
        const s = Number(second);

        let timestamp: number;
        if (!relation) {
            // No timezone information: keep the wall-clock fields in local time.
            timestamp = new Date(y, mo, d, h, mi, s).valueOf();
        } else {
            const sign = relation === '-' ? -1 : 1;
            const offsetMs = relation === 'Z' ? 0 :
                sign * (Number(offsetHour) * 60 + Number(offsetMinute)) * 60 * 1000;
            // The fields are wall-clock time at the given offset; subtract it to reach UTC.
            timestamp = Date.UTC(y, mo, d, h, mi, s) - offsetMs;
        }

        if (Number.isNaN(timestamp)) {
            return undefined;
        }

        return new Date(timestamp);
    }
}