Spaces:

bonesmasher
/

web_reader

Build error

App Files Files Community

web_reader / src /services /pdf-extract.ts

nomagick

fix: removing core-js pollution

61a1252 unverified about 1 year ago

raw

history blame contribute delete

13.6 kB

	import { singleton } from 'tsyringe';
	import _ from 'lodash';
	import { TextItem } from 'pdfjs-dist/types/src/display/api';
	import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
	import { GlobalLogger } from './logger';
	import { PDFContent } from '../db/pdf';
	import dayjs from 'dayjs';
	import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
	import { randomUUID } from 'crypto';
	import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
	import path from 'path';
	import { AsyncLocalContext } from './async-context';
	const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
	dayjs.extend(utc); // Extend dayjs with the UTC plugin
	const timezone = require('dayjs/plugin/timezone');
	dayjs.extend(timezone);

	const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
	const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';

	const md5Hasher = new HashManager('md5', 'hex');

	function stdDev(numbers: number[]) {
	const mean = _.mean(numbers);
	const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));
	const avgSquareDiff = _.mean(squareDiffs);
	return Math.sqrt(avgSquareDiff);
	}

	function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean {
	if (!transform) {
	return false;
	}
	const [a, b, c, d, _e, _f] = transform;

	// Calculate the rotation angles using arctan(b/a) and arctan(-c/d)
	const angle1 = Math.atan2(b, a) * (180 / Math.PI); // from a, b
	const angle2 = Math.atan2(-c, d) * (180 / Math.PI); // from c, d

	// Either angle1 or angle2 can be used to determine the rotation, they should be equivalent
	const rotationAngle1 = Math.abs(angle1);
	const rotationAngle2 = Math.abs(angle2);

	// Check if the absolute rotation angle is greater than or equal to 35 degrees
	return rotationAngle1 >= 35 \|\| rotationAngle2 >= 35;
	}

	@singleton()
	export class PDFExtractor extends AsyncService {

	logger = this.globalLogger.child({ service: this.constructor.name });
	pdfjs!: Awaited<typeof pPdfjs>;

	cacheRetentionMs = 1000 * 3600 * 24 * 7;

	constructor(
	protected globalLogger: GlobalLogger,
	protected firebaseObjectStorage: FirebaseStorageBucketControl,
	protected asyncLocalContext: AsyncLocalContext,
	) {
	super(...arguments);
	}

	override async init() {
	await this.dependencyReady();
	this.pdfjs = await pPdfjs;

	this.emit('ready');
	}

	isDataUrl(url: string) {
	return url.startsWith('data:');
	}

	parseDataUrl(url: string) {
	const protocol = url.slice(0, url.indexOf(':'));
	const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';'));
	const data = url.slice(url.indexOf(',') + 1);
	if (protocol !== 'data' \|\| !data) {
	throw new Error('Invalid data URL');
	}

	if (contentType !== 'application/pdf') {
	throw new Error('Invalid data URL type');
	}

	return {
	type: contentType,
	data: data
	};
	}

	async extract(url: string \| URL) {
	let loadingTask: PDFDocumentLoadingTask;

	if (typeof url === 'string' && this.isDataUrl(url)) {
	const { data } = this.parseDataUrl(url);
	const binary = Uint8Array.from(Buffer.from(data, 'base64'));
	loadingTask = this.pdfjs.getDocument({
	data: binary,
	disableFontFace: true,
	verbosity: 0,
	cMapUrl: nodeCmapUrl,
	});
	} else {
	loadingTask = this.pdfjs.getDocument({
	url,
	disableFontFace: true,
	verbosity: 0,
	cMapUrl: nodeCmapUrl,
	});
	}


	const doc = await loadingTask.promise;
	const meta = await doc.getMetadata();

	const textItems: TextItem[][] = [];

	for (const pg of _.range(0, doc.numPages)) {
	const page = await doc.getPage(pg + 1);
	const textContent = await page.getTextContent({ includeMarkedContent: true });
	textItems.push((textContent.items as TextItem[]));
	}

	const articleCharHeights: number[] = [];
	for (const textItem of textItems.flat()) {
	if (textItem.height) {
	articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
	}
	}
	const articleAvgHeight = _.mean(articleCharHeights);
	const articleStdDevHeight = stdDev(articleCharHeights);
	// const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
	const mdOps: Array<{
	text: string;
	op?: 'new' \| 'append';
	mode: 'h1' \| 'h2' \| 'p' \| 'appendix' \| 'space';
	}> = [];

	const rawChunks: string[] = [];

	let op: 'append' \| 'new' = 'new';
	let mode: 'h1' \| 'h2' \| 'p' \| 'space' \| 'appendix' = 'p';
	for (const pageTextItems of textItems) {
	const charHeights = [];
	for (const textItem of pageTextItems as TextItem[]) {
	if (textItem.height) {
	charHeights.push(...Array(textItem.str.length).fill(textItem.height));
	}
	rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
	}

	const avgHeight = _.mean(charHeights);
	const stdDevHeight = stdDev(charHeights);
	// const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];

	for (const textItem of pageTextItems) {
	if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
	mode = 'h1';
	} else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
	mode = 'h2';
	} else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
	mode = 'appendix';
	} else if (textItem.height) {
	mode = 'p';
	} else {
	mode = 'space';
	}

	if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
	mode = 'appendix';
	}

	mdOps.push({
	op,
	mode,
	text: textItem.str
	});

	if (textItem.hasEOL && !textItem.str) {
	op = 'new';
	} else {
	op = 'append';
	}
	}
	}

	const mdChunks = [];
	const appendixChunks = [];
	mode = 'space';
	for (const x of mdOps) {
	const previousMode: string = mode;
	const changeToMdChunks = [];

	const isNewStart = x.mode !== 'space' && (x.op === 'new' \|\| (previousMode === 'appendix' && x.mode !== previousMode));

	if (isNewStart) {
	switch (x.mode) {
	case 'h1': {
	changeToMdChunks.push(`\n\n# `);
	mode = x.mode;
	break;
	}

	case 'h2': {
	changeToMdChunks.push(`\n\n## `);
	mode = x.mode;
	break;
	}

	case 'p': {
	changeToMdChunks.push(`\n\n`);
	mode = x.mode;
	break;
	}

	case 'appendix': {
	mode = x.mode;
	appendixChunks.push(`\n\n`);
	break;
	}

	default: {
	break;
	}
	}
	} else {
	if (x.mode === 'appendix' && appendixChunks.length) {
	const lastChunk = appendixChunks[appendixChunks.length - 1];
	if (!lastChunk.match(/(\s+\|-)$/) && lastChunk.length !== 1) {
	appendixChunks.push(' ');
	}
	} else if (mdChunks.length) {
	const lastChunk = mdChunks[mdChunks.length - 1];
	if (!lastChunk.match(/(\s+\|-)$/) && lastChunk.length !== 1) {
	changeToMdChunks.push(' ');
	}
	}
	}

	if (x.text) {
	if (x.mode == 'appendix') {
	if (appendixChunks.length \|\| isNewStart) {
	appendixChunks.push(x.text);
	} else {
	changeToMdChunks.push(x.text);
	}
	} else {
	changeToMdChunks.push(x.text);
	}
	}

	if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
	const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
	changeToMdChunks.unshift(appendix);
	changeToMdChunks.unshift(`\n\n`);
	appendixChunks.length = 0;
	}

	if (x.mode === 'space' && changeToMdChunks.length) {
	changeToMdChunks.length = 1;
	}
	if (changeToMdChunks.length) {
	mdChunks.push(...changeToMdChunks);
	}
	}

	if (mdChunks.length) {
	mdChunks[0] = mdChunks[0].trimStart();
	}

	return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
	}

	async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
	if (!url) {
	return undefined;
	}
	let nameUrl = alternativeUrl \|\| url;
	const digest = md5Hasher.hash(nameUrl);

	if (this.isDataUrl(url)) {
	nameUrl = `blob://pdf:${digest}`;
	}

	const cache: PDFContent \| undefined = nameUrl.startsWith('blob:') ? undefined :
	(await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

	if (cache) {
	const age = Date.now() - cache?.createdAt.valueOf();
	const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
	this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
	data: url, url: nameUrl, digest, age, stale, cacheTolerance
	});

	if (!stale) {
	if (cache.content && cache.text) {
	return {
	meta: cache.meta,
	content: cache.content,
	text: cache.text
	};
	}

	try {
	const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
	let cached = JSON.parse(r.toString('utf-8'));

	return {
	meta: cached.meta,
	content: cached.content,
	text: cached.text
	};
	} catch (err) {
	this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });

	return undefined;
	}
	}
	}

	let extracted;

	try {
	extracted = await this.extract(url);
	} catch (err: any) {
	this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
	throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
	}

	if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
	const theID = randomUUID();
	await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
	Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
	PDFContent.save(
	PDFContent.from({
	_id: theID,
	src: nameUrl,
	meta: extracted?.meta \|\| {},
	urlDigest: digest,
	createdAt: new Date(),
	expireAt: new Date(Date.now() + this.cacheRetentionMs)
	}).degradeForFireStore()
	).catch((r) => {
	this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
	});
	}

	return extracted;
	}

	parsePdfDate(pdfDate: string \| undefined) {
	if (!pdfDate) {
	return undefined;
	}
	// Remove the 'D:' prefix
	const cleanedDate = pdfDate.slice(2);

	// Define the format without the timezone part first
	const dateTimePart = cleanedDate.slice(0, 14);
	const timezonePart = cleanedDate.slice(14);

	// Construct the full date string in a standard format
	const formattedDate = `${dateTimePart}${timezonePart.replace("'", "").replace("'", "")}`;

	// Parse the date with timezone
	const parsedDate = dayjs(formattedDate, "YYYYMMDDHHmmssZ");

	const date = parsedDate.toDate();

	if (!date.valueOf()) {
	return undefined;
	}

	return date;
	}
	}