web_reader / build /services /pdf-extract.js
Mohammad Shahid
Include pre-built files for HF deployment
f316cce
"use strict";
/**
 * TypeScript decorator helper.
 *
 * Applies `decorators` to a class (2 arguments), a property (3 arguments) or
 * a member with a descriptor (4 arguments). Decorators run last-to-first, and
 * a truthy return value from a decorator replaces the running result.
 * Delegates to `Reflect.decorate` when that function exists.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        // Class decoration: the running result is the constructor itself.
        result = target;
    }
    else {
        if (desc === null) {
            // A `null` descriptor means "look up the current one".
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    const hasReflectDecorate = typeof Reflect === "object" && typeof Reflect.decorate === "function";
    if (hasReflectDecorate) {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let idx = decorators.length - 1; idx >= 0; idx--) {
            const decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        // Member decoration: install the (possibly replaced) descriptor.
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript metadata helper: produces a metadata decorator through
 * `Reflect.metadata` when that function is available, otherwise yields
 * `undefined`.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const canDelegate = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    if (!canDelegate) {
        return undefined;
    }
    return Reflect.metadata(k, v);
};
/**
 * TypeScript interop helper: returns `mod` unchanged when it is flagged as an
 * ES module (`__esModule`), otherwise wraps it as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.PDFExtractor = void 0;
const tsyringe_1 = require("tsyringe");
const lodash_1 = __importDefault(require("lodash"));
const civkit_1 = require("civkit");
const logger_1 = require("./logger");
const pdf_1 = require("../db/pdf");
const dayjs_1 = __importDefault(require("dayjs"));
const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket");
const crypto_1 = require("crypto");
const path_1 = __importDefault(require("path"));
const async_context_1 = require("./async-context");
// Register the dayjs UTC plugin on the dayjs instance imported above.
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
dayjs_1.default.extend(utc); // Extend dayjs with the UTC plugin
// Register the dayjs timezone plugin the same way.
const timezone = require('dayjs/plugin/timezone');
dayjs_1.default.extend(timezone);
// The pdf.js legacy build is shipped as an ES module (.mjs), so it is loaded
// with a dynamic import(); the resulting promise is awaited in PDFExtractor.init().
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
// Path (with trailing slash) of the `cmaps` directory, resolved two levels up
// from the file that `require.resolve('pdfjs-dist')` points at; handed to
// pdf.js as `cMapUrl`.
const nodeCmapUrl = path_1.default.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
// Shared MD5 hasher with hex output, used to derive cache digests from URLs.
const md5Hasher = new civkit_1.HashManager('md5', 'hex');
/**
 * Population standard deviation of `numbers`: the square root of the mean
 * squared distance from the mean.
 *
 * @param {number[]} numbers - samples to measure.
 * @returns {number} the standard deviation (whatever `mean` yields for an
 *   empty array propagates through).
 */
function stdDev(numbers) {
    const center = lodash_1.default.mean(numbers);
    const variance = lodash_1.default.mean(numbers.map((value) => (value - center) ** 2));
    return Math.sqrt(variance);
}
/**
 * Tells whether a 2D affine transform `[a, b, c, d, e, f]` rotates its
 * content by 35 degrees or more.
 *
 * Two angles are derived, one from the (a, b) pair and one from the (c, d)
 * pair; the transform counts as rotated when the absolute value of either
 * reaches 35 degrees. The translation components are not consulted.
 *
 * @param {ArrayLike<number> | null | undefined} transform - the matrix, if any.
 * @returns {boolean} `false` when no transform is supplied.
 */
function isRotatedByAtLeast35Degrees(transform) {
    if (!transform) {
        return false;
    }
    const radToDeg = 180 / Math.PI;
    const [scaleX, skewY, skewX, scaleY] = transform;
    const anglesInRadians = [
        Math.atan2(skewY, scaleX),
        Math.atan2(-skewX, scaleY),
    ];
    return anglesInRadians.some((radians) => Math.abs(radians * radToDeg) >= 35);
}
/**
 * Service that extracts text from PDF documents with pdf.js, renders it as
 * lightweight Markdown, and caches results in Firestore / object storage.
 */
let PDFExtractor = class PDFExtractor extends civkit_1.AsyncService {
    /**
     * @param globalLogger - root logger; a child logger tagged with this
     *   service's class name is derived from it.
     * @param firebaseObjectStorage - object storage used to persist extraction
     *   results under `pdfs/<id>`.
     * @param asyncLocalContext - context holder; its `ctx.DNT` flag disables
     *   cache writes in `cachedExtract`.
     */
    constructor(globalLogger, firebaseObjectStorage, asyncLocalContext) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.firebaseObjectStorage = firebaseObjectStorage;
        this.asyncLocalContext = asyncLocalContext;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
        // Lifetime written into `expireAt` of new cache records: 7 days.
        this.cacheRetentionMs = 1000 * 3600 * 24 * 7;
    }
    /**
     * Waits for dependencies and for the pdf.js module to finish loading,
     * then emits `ready`.
     */
    async init() {
        await this.dependencyReady();
        this.pdfjs = await pPdfjs;
        this.emit('ready');
    }
    /**
     * @param url - candidate value.
     * @returns {boolean} `true` only for strings starting with `data:`;
     *   non-string values (e.g. URL objects) yield `false` instead of throwing.
     */
    isDataUrl(url) {
        return typeof url === 'string' && url.startsWith('data:');
    }
    /**
     * Splits a `data:` URL carrying a PDF into its media type and payload.
     *
     * The header is everything between `data:` and the first comma; the media
     * type is the first `;`-separated token of that header, so URLs without
     * any `;` parameter and URLs with extra parameters are both handled.
     *
     * @param {string} url - the data URL.
     * @returns {{ type: string, data: string }} media type and the (still
     *   encoded) payload.
     * @throws {Error} `Invalid data URL` when the scheme, the comma separator
     *   or the payload is missing; `Invalid data URL type` when the media
     *   type is not `application/pdf` (compared case-insensitively).
     */
    parseDataUrl(url) {
        const prefix = 'data:';
        const commaIndex = url.indexOf(',');
        if (!url.startsWith(prefix) || commaIndex === -1) {
            throw new Error('Invalid data URL');
        }
        const data = url.slice(commaIndex + 1);
        if (!data) {
            throw new Error('Invalid data URL');
        }
        const header = url.slice(prefix.length, commaIndex);
        const contentType = header.split(';')[0].trim().toLowerCase();
        if (contentType !== 'application/pdf') {
            throw new Error('Invalid data URL type');
        }
        return {
            type: contentType,
            data: data
        };
    }
    /**
     * Loads a PDF and converts its text into Markdown plus raw text.
     *
     * Text items taller than the document-wide average by 3 (resp. 2)
     * standard deviations become `#` (resp. `##`) headings; items noticeably
     * smaller than their page's average, or rotated by 35 degrees or more,
     * are collected as "appendix" text and emitted as `>` block quotes.
     *
     * @param url - a `data:` URL (payload decoded as base64) or anything
     *   pdf.js accepts as `url`.
     * @returns {Promise<{ meta: object, content: string, text: string }>}
     *   pdf.js document info, Markdown content and raw text.
     */
    async extract(url) {
        const source = this.isDataUrl(url)
            // Copy into a standalone Uint8Array rather than handing pdf.js a Buffer.
            ? { data: Uint8Array.from(Buffer.from(this.parseDataUrl(url).data, 'base64')) }
            : { url };
        const loadingTask = this.pdfjs.getDocument({
            ...source,
            disableFontFace: true,
            verbosity: 0,
            cMapUrl: nodeCmapUrl,
        });
        let meta;
        const textItems = [];
        try {
            const doc = await loadingTask.promise;
            meta = await doc.getMetadata();
            for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
                const page = await doc.getPage(pageNumber);
                const textContent = await page.getTextContent({ includeMarkedContent: true });
                textItems.push(textContent.items);
            }
        }
        finally {
            // Everything needed has been copied out by now; release the
            // document (and its worker) on success and on failure alike.
            try {
                await loadingTask.destroy();
            }
            catch (err) {
                this.logger.warn('Unable to release pdf.js resources', { err });
            }
        }
        // Character-weighted height statistics over the whole document.
        const articleCharHeights = [];
        for (const textItem of textItems.flat()) {
            if (textItem.height) {
                articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
            }
        }
        const articleAvgHeight = lodash_1.default.mean(articleCharHeights);
        const articleStdDevHeight = stdDev(articleCharHeights);
        const mdOps = [];
        const rawChunks = [];
        let op = 'new';
        let mode = 'p';
        for (const pageTextItems of textItems) {
            // Character-weighted height statistics for this page only.
            const charHeights = [];
            for (const textItem of pageTextItems) {
                if (textItem.height) {
                    charHeights.push(...Array(textItem.str.length).fill(textItem.height));
                }
                // Marked-content items carry no `str`; without the fallback the
                // literal text "undefined" would end up in the raw output.
                rawChunks.push(`${textItem.str ?? ''}${textItem.hasEOL ? '\n' : ''}`);
            }
            const avgHeight = lodash_1.default.mean(charHeights);
            const stdDevHeight = stdDev(charHeights);
            for (const textItem of pageTextItems) {
                if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                    mode = 'h1';
                }
                else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                    mode = 'h2';
                }
                else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                    mode = 'appendix';
                }
                else if (textItem.height) {
                    mode = 'p';
                }
                else {
                    mode = 'space';
                }
                if (isRotatedByAtLeast35Degrees(textItem.transform)) {
                    mode = 'appendix';
                }
                mdOps.push({
                    op,
                    mode,
                    text: textItem.str
                });
                // An empty item that ends a line starts a new block.
                if (textItem.hasEOL && !textItem.str) {
                    op = 'new';
                }
                else {
                    op = 'append';
                }
            }
        }
        const mdChunks = [];
        const appendixChunks = [];
        mode = 'space';
        for (const x of mdOps) {
            const previousMode = mode;
            const changeToMdChunks = [];
            // A block starts on an explicit 'new' op, or when leaving appendix text.
            const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));
            if (isNewStart) {
                switch (x.mode) {
                    case 'h1': {
                        changeToMdChunks.push(`\n\n# `);
                        mode = x.mode;
                        break;
                    }
                    case 'h2': {
                        changeToMdChunks.push(`\n\n## `);
                        mode = x.mode;
                        break;
                    }
                    case 'p': {
                        changeToMdChunks.push(`\n\n`);
                        mode = x.mode;
                        break;
                    }
                    case 'appendix': {
                        mode = x.mode;
                        appendixChunks.push(`\n\n`);
                        break;
                    }
                    default: {
                        break;
                    }
                }
            }
            else {
                // Continuing a block: separate from the previous chunk with a
                // space unless it already ends in whitespace or a hyphen, or
                // is a single character.
                if (x.mode === 'appendix' && appendixChunks.length) {
                    const lastChunk = appendixChunks[appendixChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        appendixChunks.push(' ');
                    }
                }
                else if (mdChunks.length) {
                    const lastChunk = mdChunks[mdChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        changeToMdChunks.push(' ');
                    }
                }
            }
            if (x.text) {
                if (x.mode === 'appendix') {
                    if (appendixChunks.length || isNewStart) {
                        appendixChunks.push(x.text);
                    }
                    else {
                        changeToMdChunks.push(x.text);
                    }
                }
                else {
                    changeToMdChunks.push(x.text);
                }
            }
            if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
                // Flush pending appendix text as a block quote ahead of the new block.
                const appendix = appendixChunks.join('').split(/\r?\n/).map((line) => line.trim()).filter(Boolean).map((line) => `> ${line}`).join('\n');
                changeToMdChunks.unshift(appendix);
                changeToMdChunks.unshift(`\n\n`);
                appendixChunks.length = 0;
            }
            if (x.mode === 'space' && changeToMdChunks.length) {
                changeToMdChunks.length = 1;
            }
            if (changeToMdChunks.length) {
                mdChunks.push(...changeToMdChunks);
            }
        }
        if (mdChunks.length) {
            mdChunks[0] = mdChunks[0].trimStart();
        }
        return { meta: meta.info, content: mdChunks.join(''), text: rawChunks.join('') };
    }
    /**
     * Like `extract`, but consults and populates the cache.
     *
     * The cache key is the MD5 digest of `alternativeUrl || url`. Data URLs
     * are renamed to `blob://pdf:<digest>`, and `blob:` names are neither
     * looked up nor stored.
     *
     * @param url - PDF location or data URL; a falsy value yields `undefined`.
     * @param {number} [cacheTolerance] - maximum accepted cache age in ms
     *   (default: one day).
     * @param [alternativeUrl] - name to use for digest/logging instead of `url`.
     * @returns extraction result, or `undefined` when `url` is falsy or a
     *   fresh cache record exists but its stored content cannot be loaded.
     * @throws AssertionFailureError when extraction itself fails.
     */
    async cachedExtract(url, cacheTolerance = 1000 * 3600 * 24, alternativeUrl) {
        if (!url) {
            return undefined;
        }
        let nameUrl = alternativeUrl || url;
        const digest = md5Hasher.hash(nameUrl);
        if (this.isDataUrl(url)) {
            nameUrl = `blob://pdf:${digest}`;
        }
        const cache = nameUrl.startsWith('blob:') ? undefined :
            (await pdf_1.PDFContent.fromFirestoreQuery(pdf_1.PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
        if (cache) {
            // One clock reading so the logged age and the staleness verdict agree.
            const age = Date.now() - cache.createdAt.valueOf();
            const stale = age > cacheTolerance;
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
                data: url, url: nameUrl, digest, age, stale, cacheTolerance
            });
            if (!stale) {
                // Records that carry content inline are served directly.
                if (cache.content && cache.text) {
                    return {
                        meta: cache.meta,
                        content: cache.content,
                        text: cache.text
                    };
                }
                try {
                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
                    const cached = JSON.parse(r.toString('utf-8'));
                    return {
                        meta: cached.meta,
                        content: cached.content,
                        text: cached.text
                    };
                }
                catch (err) {
                    this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });
                    // NOTE(review): callers get `undefined` here rather than a
                    // fresh extraction - confirm this is intended.
                    return undefined;
                }
            }
        }
        let extracted;
        try {
            extracted = await this.extract(url);
        }
        catch (err) {
            this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
            throw new civkit_1.AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
        }
        if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
            const theID = (0, crypto_1.randomUUID)();
            await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`, Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
            // The index record is written in the background; failures are only logged.
            pdf_1.PDFContent.save(pdf_1.PDFContent.from({
                _id: theID,
                src: nameUrl,
                meta: extracted?.meta || {},
                urlDigest: digest,
                createdAt: new Date(),
                expireAt: new Date(Date.now() + this.cacheRetentionMs)
            }).degradeForFireStore()).catch((r) => {
                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
            });
        }
        return extracted;
    }
    /**
     * Parses a PDF date string such as `D:20240131235959+08'00'`.
     *
     * Parsing is done by hand so it does not depend on which dayjs plugins
     * happen to be registered. The `D:` prefix is optional, every field after
     * the year is optional (month and day default to 1, time fields to 0),
     * and the zone may be `Z`, `+HH'mm'` or `-HH'mm'` with or without the
     * apostrophes. Without a zone the fields are read as local time.
     *
     * @param {string | undefined | null} pdfDate - raw date from PDF metadata.
     * @returns {Date | undefined} the parsed date, or `undefined` when the
     *   input is empty or not a recognizable PDF date.
     */
    parsePdfDate(pdfDate) {
        if (!pdfDate) {
            return undefined;
        }
        const pdfDatePattern = /^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)(?:\d{2}'?(?:\d{2}'?)?)?|([+-])(\d{2})'?(?:(\d{2})'?)?)?$/i;
        const matched = pdfDatePattern.exec(String(pdfDate).trim());
        if (!matched) {
            return undefined;
        }
        const [, year, month = '01', day = '01', hour = '00', minute = '00', second = '00', zulu, sign, offsetHour = '00', offsetMinute = '00',] = matched;
        const fields = [
            Number(year),
            Number(month) - 1,
            Number(day),
            Number(hour),
            Number(minute),
            Number(second),
        ];
        let date;
        if (zulu || sign) {
            const direction = sign === '-' ? -1 : 1;
            const offsetMinutes = direction * (Number(offsetHour) * 60 + Number(offsetMinute));
            // Local wall-clock time minus its UTC offset gives the UTC instant.
            date = new Date(Date.UTC(...fields) - offsetMinutes * 60000);
        }
        else {
            date = new Date(...fields);
        }
        if (Number.isNaN(date.valueOf())) {
            return undefined;
        }
        return date;
    }
};
// Publish the class, then replace both the local binding and the export with
// the result of applying the decorators below.
exports.PDFExtractor = PDFExtractor;
// Decorators applied: tsyringe's singleton(), plus "design:paramtypes"
// metadata listing the three constructor parameter types in order.
exports.PDFExtractor = PDFExtractor = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger,
firebase_storage_bucket_1.FirebaseStorageBucketControl,
async_context_1.AsyncLocalContext])
], PDFExtractor);
//# sourceMappingURL=pdf-extract.js.map