| const {
|
| Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell,
|
| HeadingLevel, AlignmentType, BorderStyle, WidthType, ShadingType,
|
| TableOfContents, PageBreak, LevelFormat, UnderlineType
|
| } = require('docx');
|
| const fs = require('fs');
|
|
|
|
|
| const C = {
|
| navy: "1B3A5C",
|
| blue: "2E75B6",
|
| lightBlue: "D6E4F0",
|
| teal: "1A7A6E",
|
| lightTeal: "D4EFEC",
|
| amber: "C45911",
|
| lightAmber:"FCE9D9",
|
| purple: "5B3A8C",
|
| lightPurple:"E8DEFF",
|
| gray: "595959",
|
| lightGray: "F2F2F2",
|
| midGray: "D9D9D9",
|
| white: "FFFFFF",
|
| black: "000000",
|
| };
|
|
|
|
|
| const border = (color = C.midGray) => ({ style: BorderStyle.SINGLE, size: 1, color });
|
| const borders = (color = C.midGray) => ({ top: border(color), bottom: border(color), left: border(color), right: border(color) });
|
| const noBorder = () => ({ style: BorderStyle.NONE, size: 0, color: C.white });
|
| const noBorders = () => ({ top: noBorder(), bottom: noBorder(), left: noBorder(), right: noBorder() });
|
|
|
| function cell(text, opts = {}) {
|
| const {
|
| bold = false, color = C.black, bg = C.white, width = 4680,
|
| italic = false, size = 20, align = AlignmentType.LEFT, shade = true
|
| } = opts;
|
| return new TableCell({
|
| borders: borders(C.midGray),
|
| width: { size: width, type: WidthType.DXA },
|
| shading: shade ? { fill: bg, type: ShadingType.CLEAR } : undefined,
|
| margins: { top: 80, bottom: 80, left: 140, right: 140 },
|
| children: [new Paragraph({
|
| alignment: align,
|
| children: [new TextRun({ text, bold, color, italics: italic, size, font: "Arial" })]
|
| })]
|
| });
|
| }
|
|
|
| function hCell(text, bg = C.navy, textColor = C.white, width = 4680) {
|
| return cell(text, { bold: true, color: textColor, bg, width, size: 20 });
|
| }
|
|
|
| function p(runs, opts = {}) {
|
| const { spacing = { before: 80, after: 80 }, align } = opts;
|
| return new Paragraph({
|
| alignment: align,
|
| spacing,
|
| children: Array.isArray(runs) ? runs : [runs],
|
| });
|
| }
|
|
|
| function t(text, opts = {}) {
|
| const { bold = false, italic = false, color = C.black, size = 22, underline } = opts;
|
| return new TextRun({ text, bold, italics: italic, color, size, font: "Arial", underline });
|
| }
|
|
|
| function h1(text) {
|
| return new Paragraph({
|
| heading: HeadingLevel.HEADING_1,
|
| spacing: { before: 360, after: 160 },
|
| children: [new TextRun({ text, bold: true, color: C.navy, size: 36, font: "Arial" })]
|
| });
|
| }
|
|
|
| function h2(text) {
|
| return new Paragraph({
|
| heading: HeadingLevel.HEADING_2,
|
| spacing: { before: 280, after: 120 },
|
| children: [new TextRun({ text, bold: true, color: C.blue, size: 28, font: "Arial" })]
|
| });
|
| }
|
|
|
| function h3(text) {
|
| return new Paragraph({
|
| heading: HeadingLevel.HEADING_3,
|
| spacing: { before: 200, after: 80 },
|
| children: [new TextRun({ text, bold: true, color: C.teal, size: 24, font: "Arial" })]
|
| });
|
| }
|
|
|
| function bullet(text, level = 0, color = C.black) {
|
| return new Paragraph({
|
| numbering: { reference: "bullets", level },
|
| spacing: { before: 40, after: 40 },
|
| children: [new TextRun({ text, color, size: 22, font: "Arial" })]
|
| });
|
| }
|
|
|
| function callout(lines, type = "note") {
|
| const configs = {
|
note: { bg: C.lightBlue, border: C.blue, label: "NOTE", labelColor: C.blue },

tip: { bg: C.lightTeal, border: C.teal, label: "KEY IDEA", labelColor: C.teal },

warning: { bg: C.lightAmber, border: C.amber, label: "WATCH OUT", labelColor: C.amber },

concept: { bg: C.lightPurple, border: C.purple, label: "CONCEPT", labelColor: C.purple },
|
| };
|
| const cfg = configs[type];
|
| return new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [9360],
|
| rows: [
|
| new TableRow({ children: [new TableCell({
|
| borders: { top: border(cfg.border), bottom: border(cfg.border), left: { style: BorderStyle.SINGLE, size: 12, color: cfg.border }, right: border(cfg.border) },
|
| shading: { fill: cfg.bg, type: ShadingType.CLEAR },
|
| margins: { top: 120, bottom: 120, left: 200, right: 200 },
|
| width: { size: 9360, type: WidthType.DXA },
|
| children: [
|
| p(t(cfg.label, { bold: true, color: cfg.labelColor, size: 20 }), { spacing: { before: 0, after: 60 } }),
|
| ...lines.map(l => p(Array.isArray(l) ? l : t(l, { size: 20 }), { spacing: { before: 20, after: 20 } }))
|
| ]
|
| })] })
|
| ]
|
| });
|
| }
|
|
|
function spacer(twips = 120) {

// Vertical gap paragraph. Spacing is in twentieths of a point: 120 = 6pt, 1440 = 72pt.

return new Paragraph({ spacing: { before: twips, after: 0 }, children: [new TextRun("")] });
|
| }
|
|
|
| function divider() {
|
| return new Paragraph({
|
| spacing: { before: 160, after: 160 },
|
| border: { bottom: { style: BorderStyle.SINGLE, size: 4, color: C.midGray, space: 1 } },
|
| children: [new TextRun("")]
|
| });
|
| }
|
|
|
| function sectionBanner(text, bg = C.navy) {
|
| return new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [9360],
|
| rows: [new TableRow({ children: [new TableCell({
|
| borders: noBorders(),
|
| shading: { fill: bg, type: ShadingType.CLEAR },
|
| margins: { top: 160, bottom: 160, left: 280, right: 280 },
|
| width: { size: 9360, type: WidthType.DXA },
|
| children: [new Paragraph({
|
| alignment: AlignmentType.LEFT,
|
| children: [new TextRun({ text, bold: true, color: C.white, size: 32, font: "Arial" })]
|
| })]
|
| })]})],
|
| });
|
| }
|
|
|
| function twoColTable(rows, widths = [3200, 6160]) {
|
| return new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: widths,
|
| rows: rows.map((row, i) => new TableRow({
|
| children: row.map((cellText, j) => {
|
| const isHeader = i === 0;
|
| return isHeader
|
| ? hCell(cellText, C.navy, C.white, widths[j])
|
| : cell(cellText, { width: widths[j], bg: i % 2 === 0 ? C.lightGray : C.white });
|
| })
|
| }))
|
| });
|
| }
|
|
|
|
|
| function pageBreak() {
|
| return new Paragraph({ children: [new PageBreak()] });
|
| }
|
|
|
|
|
|
|
|
|
|
|
| const children = [];
|
|
|
|
|
| children.push(
|
| spacer(1440),
|
| new Paragraph({
|
| alignment: AlignmentType.CENTER,
|
| spacing: { before: 0, after: 120 },
|
| children: [new TextRun({ text: "RAG SYSTEM DESIGN", bold: true, color: C.navy, size: 72, font: "Arial" })]
|
| }),
|
| new Paragraph({
|
| alignment: AlignmentType.CENTER,
|
| spacing: { before: 0, after: 240 },
|
| children: [new TextRun({ text: "A Concept Study Guide for Engineers", color: C.blue, size: 36, font: "Arial", italics: true })]
|
| }),
|
| new Table({
|
| width: { size: 5040, type: WidthType.DXA },
|
| columnWidths: [5040],
|
| rows: [new TableRow({ children: [new TableCell({
|
| borders: noBorders(),
|
| shading: { fill: C.blue, type: ShadingType.CLEAR },
|
| margins: { top: 4, bottom: 4, left: 0, right: 0 },
|
| width: { size: 5040, type: WidthType.DXA },
|
| children: [new Paragraph({ children: [new TextRun("")] })]
|
| })] })],
|
| }),
|
| spacer(480),
|
| new Paragraph({
|
| alignment: AlignmentType.CENTER,
|
| spacing: { before: 0, after: 80 },
|
| children: [new TextRun({ text: "Covers: Chunking Β· Embedding Β· Vector Databases Β· Retrieval Β· Reranking Β· Generation", color: C.gray, size: 22, font: "Arial" })]
|
| }),
|
| spacer(1200),
|
| pageBreak()
|
| );
|
|
|
|
|
| children.push(
|
| h1("Table of Contents"),
|
| new TableOfContents("Table of Contents", {
|
| hyperlink: true,
|
| headingStyleRange: "1-3",
|
| stylesWithLevels: [
|
| { styleName: "Heading 1", level: 1 },
|
| { styleName: "Heading 2", level: 2 },
|
| { styleName: "Heading 3", level: 3 },
|
| ],
|
| }),
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 0 β The Big Picture"),
|
| spacer(),
|
| h1("How a RAG System Works End-to-End"),
|
| p(t("Before diving into each component, you need a mental model of the whole pipeline. Every decision you make β how to chunk, which embedding model to pick, how to retrieve β only makes sense in the context of what comes before and after it.", { size: 22 }), { spacing: { before: 80, after: 160 } }),
|
|
|
| h2("The Two Phases"),
|
| p(t("A RAG system has two completely separate phases that run at different times:", { size: 22 })),
|
| spacer(80),
|
|
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [4680, 4680],
|
| rows: [
|
| new TableRow({ children: [hCell("INDEXING PHASE (Offline)", C.navy), hCell("QUERY PHASE (Online / Real-time)", C.teal)] }),
|
| new TableRow({ children: [
|
| cell("Runs once (or when docs change). Takes your documents, processes them, and stores them in a vector database. The user never sees this.", { bg: C.lightBlue }),
|
| cell("Runs every time a user asks a question. Takes the question, finds relevant content, and generates an answer.", { bg: C.lightTeal }),
|
| ]}),
|
| new TableRow({ children: [
|
| cell("Parse β Chunk β Embed β Store", { bold: true, bg: C.lightBlue }),
|
| cell("Embed query β Retrieve β Rerank β Generate", { bold: true, bg: C.lightTeal }),
|
| ]}),
|
| ]
|
| }),
|
|
|
| spacer(160),
|
| callout([
|
| "Why does this matter? Because the indexing phase determines the ceiling of your system. No matter how good your retrieval logic is, if your chunks are poorly formed, the LLM cannot give a good answer. You have already done the hardest part of indexing β parsing and chunking. The decisions ahead are about embedding and retrieval."
|
| ], "tip"),
|
| spacer(),
|
|
|
| h2("The Full Pipeline at a Glance"),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [1200, 2400, 5760],
|
| rows: [
|
| new TableRow({ children: [hCell("Step", C.navy, C.white, 1200), hCell("Component", C.navy, C.white, 2400), hCell("What Happens", C.navy, C.white, 5760)] }),
|
| ...[
|
| ["1", "Parsing", "Raw document (PDF/DOCX) β structured blocks with type, content, heading level. YOU HAVE DONE THIS."],
|
| ["2", "Chunking", "Structured blocks β text segments of controlled size, each knowing their section context. YOU HAVE DONE THIS."],
|
| ["3", "Embedding (Index)", "Each chunk's text β a vector (array of floats). Similar meaning = similar vector direction."],
|
| ["4", "Vector Storage", "Vectors + metadata stored in a vector database. Supports fast similarity search."],
|
| ["5", "Query Embedding", "User's question β same embedding model β a query vector."],
|
| ["6", "Retrieval", "Query vector compared against all stored vectors. Top-N most similar chunks returned."],
|
| ["7", "Reranking", "A second, slower model re-scores the top-N chunks for true relevance. Optional but powerful."],
|
| ["8", "Generation", "Retrieved chunks assembled into a prompt. LLM reads them and writes the answer."],
|
| ].map(([step, comp, what], i) => new TableRow({ children: [
|
| cell(step, { width: 1200, align: AlignmentType.CENTER, bold: true, bg: i % 2 === 0 ? C.lightGray : C.white }),
|
| cell(comp, { width: 2400, bold: true, bg: i % 2 === 0 ? C.lightGray : C.white }),
|
| cell(what, { width: 5760, bg: i % 2 === 0 ? C.lightGray : C.white }),
|
| ]}))
|
| ]
|
| }),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 1 β Embeddings"),
|
| spacer(),
|
| h1("Embeddings: Turning Text Into Numbers"),
|
|
|
| h2("What Is an Embedding?"),
|
| p(t("An embedding is a list of floating point numbers (a vector) that represents the meaning of a piece of text. The key property is that texts with similar meanings produce vectors that point in similar directions in high-dimensional space.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
|
| p(t("For example:", { size: 22 })),
|
| spacer(60),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [4000, 5360],
|
| rows: [
|
| new TableRow({ children: [hCell("Text", C.navy, C.white, 4000), hCell("What the embedding captures", C.navy, C.white, 5360)] }),
|
| new TableRow({ children: [cell("What is the SLA for urgent tickets?", { width: 4000 }), cell("Intent: asking about time limits for a specific urgency level", { width: 5360 })] }),
|
| new TableRow({ children: [cell("Urgent/Critical - 1 hr", { width: 4000, bg: C.lightGray }), cell("Fact: a time value tied to a urgency category", { width: 5360, bg: C.lightGray })] }),
|
| ]
|
| }),
|
| spacer(120),
|
| p(t("These two texts would have very similar vectors even though they use different words β and that is why vector search can find the answer to a question even when the document doesn't use the exact same phrasing as the query.", { size: 22, italic: true })),
|
| spacer(),
|
|
|
| h2("Dimensions"),
|
| p(t("The 'size' of an embedding is called its dimensionality β the number of floats in the vector. Common sizes are 384, 768, 1024, and 1536. Higher dimensions generally capture more nuance, but cost more to store and search. You do not choose this β it is fixed by the model you pick.", { size: 22 })),
|
| spacer(),
|
|
|
| h2("The Critical Rule: Consistency"),
|
| callout([
|
| "You MUST use the same embedding model at indexing time and at query time. If you embed your chunks with model A and then embed the user's question with model B, the vectors live in different spaces and similarity scores become meaningless. This is one of the most common mistakes in production RAG systems."
|
| ], "warning"),
|
| spacer(),
|
|
|
| h2("Choosing an Embedding Model"),
|
| p(t("There are two categories of embedding models:", { size: 22 })),
|
| spacer(80),
|
|
|
| h3("API-Based Models (Hosted)"),
|
| p(t("You send text to an API endpoint and receive the vector back. You pay per token and do not need to manage any infrastructure.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2800, 1400, 1400, 3760],
|
| rows: [
|
| new TableRow({ children: [hCell("Model", C.navy, C.white, 2800), hCell("Dimensions", C.navy, C.white, 1400), hCell("Max Tokens", C.navy, C.white, 1400), hCell("Best For", C.navy, C.white, 3760)] }),
|
| ...([
|
| ["OpenAI text-embedding-3-small", "1536", "8191", "General use. Best price/performance ratio. Good starting point."],
|
| ["OpenAI text-embedding-3-large", "3072", "8191", "Higher accuracy when quality matters more than cost."],
|
| ["Cohere embed-v3", "1024", "512", "Multilingual documents. Has a native 'search' vs 'classification' mode distinction."],
|
| ]).map(([m, d, t2, b], i) => new TableRow({ children: [
|
| cell(m, { width: 2800, bg: i%2===0?C.lightGray:C.white }),
|
| cell(d, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(t2, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(b, { width: 3760, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
| spacer(160),
|
|
|
| h3("Local Models (Self-Hosted)"),
|
| p(t("You download the model weights and run them yourself. Free per-call, but requires compute and maintenance.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2800, 1400, 1400, 3760],
|
| rows: [
|
| new TableRow({ children: [hCell("Model", C.navy, C.white, 2800), hCell("Dimensions", C.navy, C.white, 1400), hCell("Max Tokens", C.navy, C.white, 1400), hCell("Notes", C.navy, C.white, 3760)] }),
|
| ...([
|
| ["BAAI/bge-large-en-v1.5", "1024", "512", "Strong open-source general model. Common baseline."],
|
| ["sentence-transformers/all-MiniLM-L6-v2", "384", "256", "Very fast, very small. Good for prototyping on CPU."],
|
| ["nomic-embed-text-v1.5", "768", "8192", "Long context local model. Rare combination."],
|
| ]).map(([m, d, t2, b], i) => new TableRow({ children: [
|
| cell(m, { width: 2800, bg: i%2===0?C.lightGray:C.white }),
|
| cell(d, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(t2, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(b, { width: 3760, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
| spacer(160),
|
|
|
| h2("Asymmetric vs Symmetric Embedding"),
|
| p(t("This concept is important and often skipped in tutorials. There are two types of embedding tasks:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2000, 3680, 3680],
|
| rows: [
|
| new TableRow({ children: [hCell("Type", C.navy, C.white, 2000), hCell("What it means", C.navy, C.white, 3680), hCell("When to use it", C.navy, C.white, 3680)] }),
|
| new TableRow({ children: [
|
| cell("Symmetric", { width: 2000, bold: true }),
|
| cell("Query and document are the same kind of text. You embed both the same way.", { width: 3680 }),
|
| cell("Semantic similarity search. Finding duplicate content. Clustering.", { width: 3680 }),
|
| ]}),
|
| new TableRow({ children: [
|
| cell("Asymmetric", { width: 2000, bold: true, bg: C.lightGray }),
|
| cell("Query is a short question. Document is a long passage that answers it. They need different treatment.", { width: 3680, bg: C.lightGray }),
|
| cell("Question answering. RAG. Finding answers to user questions in documents.", { width: 3680, bg: C.lightGray }),
|
| ]}),
|
| ]
|
| }),
|
| spacer(120),
|
| callout([
|
| "For RAG, you are almost always doing asymmetric search. Some models (like bge) support this by letting you prepend a prefix like 'Represent this sentence for searching relevant passages:' to the query. Cohere embed-v3 handles this through an explicit input_type parameter ('search_query' vs 'search_document'). Always check your model's docs for this."
|
| ], "concept"),
|
| spacer(),
|
|
|
| h2("What Gets Embedded: text_to_embed vs raw_text"),
|
| p(t("You have already made the right architectural decision here. The thing you embed is NOT the same as the thing you return to the LLM. Specifically:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 6960],
|
| rows: [
|
| new TableRow({ children: [hCell("Field", C.navy, C.white, 2400), hCell("Content and Purpose", C.navy, C.white, 6960)] }),
|
| new TableRow({ children: [
|
| cell("text_to_embed", { width: 2400, bold: true, bg: C.lightBlue }),
|
| cell("section_path prefix + chunk content. The section path gives the embedding model context about WHERE in the document this content lives. This is what Anthropic calls Contextual Retrieval.", { width: 6960, bg: C.lightBlue }),
|
| ]}),
|
| new TableRow({ children: [
|
| cell("raw_text", { width: 2400, bold: true }),
|
| cell("Just the chunk content. This is what you pass to the LLM when generating the answer. Clean, no prefixes.", { width: 6960 }),
|
| ]}),
|
| ]
|
| }),
|
| spacer(120),
|
| callout([
|
| "Why not embed raw_text alone? Because a chunk like 'Regular Response Email SLA β 2 hr' has almost no context on its own. The embedding model doesn't know this is about SLAs inside a CSS SOP about Champion Petfoods. The section path tells it that, and produces a much better vector."
|
| ], "tip"),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 2 β Vector Databases"),
|
| spacer(),
|
| h1("Vector Databases: Storing and Searching Embeddings"),
|
|
|
| h2("What Problem Does a Vector DB Solve?"),
|
| p(t("Once you have embedded all your chunks, you have thousands of vectors (arrays of floats). When a user asks a question, you embed the question and need to find the most similar vectors from your stored set. This is called Approximate Nearest Neighbor (ANN) search.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
|
| p(t("A regular database like Postgres can do this with a plugin (pgvector), but dedicated vector databases build their entire architecture around making ANN search fast, scalable, and feature-rich.", { size: 22 })),
|
| spacer(),
|
|
|
| h2("The Anatomy of a Vector DB Record"),
|
| p(t("Every record stored in a vector DB has three parts:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [1800, 2400, 5160],
|
| rows: [
|
| new TableRow({ children: [hCell("Part", C.navy, C.white, 1800), hCell("Type", C.navy, C.white, 2400), hCell("Description", C.navy, C.white, 5160)] }),
|
| new TableRow({ children: [cell("id", { width: 1800, bold: true }), cell("string / UUID", { width: 2400 }), cell("Unique identifier for this record.", { width: 5160 })] }),
|
| new TableRow({ children: [cell("vector", { width: 1800, bold: true, bg: C.lightGray }), cell("float[]", { width: 2400, bg: C.lightGray }), cell("The embedding. This is what gets searched. Never returned to the user.", { width: 5160, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("payload / metadata", { width: 1800, bold: true }), cell("dict / JSON", { width: 2400 }), cell("Everything else: raw_text, section_path, source_file, parent_elem_index, has_table, etc. This is what you read after a search match.", { width: 5160 })] }),
|
| ]
|
| }),
|
| spacer(160),
|
| callout([
|
| "The vector is the key for finding. The payload is the value you actually use. You search by vector, then read the payload of the matches. Think of it like an index in a book β the index helps you find the page, but the page contains the actual content."
|
| ], "concept"),
|
| spacer(),
|
|
|
| h2("How Similarity Search Works"),
|
| p(t("When you search a vector DB, you are not doing exact matching. You are asking: 'which stored vectors are most similar in direction to my query vector?' This is measured by a similarity metric.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2000, 2800, 4560],
|
| rows: [
|
| new TableRow({ children: [hCell("Metric", C.navy, C.white, 2000), hCell("What it measures", C.navy, C.white, 2800), hCell("When to use", C.navy, C.white, 4560)] }),
|
| new TableRow({ children: [
|
| cell("Cosine Similarity", { width: 2000, bold: true }),
|
| cell("Angle between vectors. Ignores magnitude.", { width: 2800 }),
|
| cell("Best for text. Use this for RAG. Most embedding models are trained with cosine similarity in mind.", { width: 4560 }),
|
| ]}),
|
| new TableRow({ children: [
|
| cell("Dot Product", { width: 2000, bold: true, bg: C.lightGray }),
|
| cell("Magnitude Γ cosine. Affected by vector length.", { width: 2800, bg: C.lightGray }),
|
| cell("Use when vectors are normalized (length = 1). Mathematically equivalent to cosine then.", { width: 4560, bg: C.lightGray }),
|
| ]}),
|
| new TableRow({ children: [
|
| cell("Euclidean (L2)", { width: 2000, bold: true }),
|
| cell("Straight-line distance between vector tips.", { width: 2800 }),
|
| cell("More common in image similarity. Less common for text RAG.", { width: 4560 }),
|
| ]}),
|
| ]
|
| }),
|
| spacer(),
|
|
|
| h2("Vector DB Options"),
|
| spacer(80),
|
| h3("Fully Dedicated Vector Databases"),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [1600, 1400, 2360, 3200, 800],
|
| rows: [
|
| new TableRow({ children: [hCell("DB", C.navy, C.white, 1600), hCell("Hosting", C.navy, C.white, 1400), hCell("Key Strength", C.navy, C.white, 2360), hCell("Best For", C.navy, C.white, 3200), hCell("Free?", C.navy, C.white, 800)] }),
|
| ...([
|
| ["Qdrant", "Local / Cloud", "Rich filtering on payload. Sparse+dense hybrid built-in.", "Production RAG. Local dev. Full control.", "Yes"],
|
| ["Pinecone", "Cloud only", "Fully managed. Very easy ops.", "Teams that want zero infra management.", "Paid"],
|
| ["Weaviate", "Local / Cloud", "GraphQL interface. Built-in modules.", "Complex data relationships.", "Yes"],
|
| ["Chroma", "Local", "Extremely simple API. Great for prototyping.", "Local dev and experimentation.", "Yes"],
|
| ["Milvus", "Local / Cloud", "High scale. Billion-vector support.", "Large enterprise deployments.", "Yes"],
|
| ]).map(([db, host, str, best, free], i) => new TableRow({ children: [
|
| cell(db, { width: 1600, bold: true, bg: i%2===0?C.lightGray:C.white }),
|
| cell(host, { width: 1400, bg: i%2===0?C.lightGray:C.white }),
|
| cell(str, { width: 2360, bg: i%2===0?C.lightGray:C.white }),
|
| cell(best, { width: 3200, bg: i%2===0?C.lightGray:C.white }),
|
| cell(free, { width: 800, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
| spacer(160),
|
| h3("Traditional Databases with Vector Extensions"),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2000, 7360],
|
| rows: [
|
| new TableRow({ children: [hCell("Option", C.navy, C.white, 2000), hCell("Notes", C.navy, C.white, 7360)] }),
|
| new TableRow({ children: [cell("Postgres + pgvector", { width: 2000, bold: true }), cell("Good choice if you already use Postgres. Vector search is slower than dedicated DBs at large scale, but fine for most business RAG applications under ~1M chunks.", { width: 7360 })] }),
|
| new TableRow({ children: [cell("Redis + RediSearch", { width: 2000, bold: true, bg: C.lightGray }), cell("Fast in-memory option. Good if you already have Redis and latency is critical.", { width: 7360, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("Elasticsearch", { width: 2000, bold: true }), cell("Added dense vector support. Better for hybrid search (already has strong BM25). Worth considering if you need full-text + vector in one system.", { width: 7360 })] }),
|
| ]
|
| }),
|
| spacer(160),
|
| callout([
|
| "For your use case (SOP documents, moderate scale), Qdrant running locally in Docker is the right choice. It has built-in support for hybrid search (dense + sparse in one index), rich payload filtering, and no cloud dependency. When you are ready to deploy, it also has a managed cloud tier."
|
| ], "tip"),
|
| spacer(),
|
|
|
| h2("Collections and Indexes"),
|
| p(t("In a vector DB, a 'collection' (Qdrant term) or 'index' (Pinecone term) is a named container for a set of vectors. All vectors in a collection must have the same dimensionality. You typically create one collection per document set or per embedding model.", { size: 22 })),
|
| spacer(),
|
| h2("Payload Filtering"),
|
| p(t("One of the most powerful features of dedicated vector DBs is the ability to filter by payload fields during search β not after. This means you can say 'find the top 5 chunks most similar to my query, but only from source_file = X and has_table = true'. The filter is applied during the ANN search, not on the full result set.", { size: 22 })),
|
| spacer(80),
|
| callout([
|
| "This is why storing rich metadata (source_file, section_path, parent_elem_index, has_table) on every chunk matters. These fields become first-class filter parameters at query time. For example, if the user says 'in the Champion Petfoods SOP' you can filter to just that file."
|
| ], "concept"),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 3 β Retrieval"),
|
| spacer(),
|
| h1("Retrieval: Finding the Right Chunks"),
|
| p(t("Retrieval is the step that connects the user's question to your stored knowledge. The goal is simple: given a query, return the most relevant chunks. But there are multiple strategies with meaningfully different strengths.", { size: 22 }), { spacing: { before: 80, after: 160 } }),
|
|
|
| h2("Strategy 1: Dense Retrieval (Vector Search)"),
|
| p(t("This is what most people mean when they say 'RAG retrieval'. You embed the query and find the nearest vectors in your collection by cosine similarity.", { size: 22 })),
|
| spacer(80),
|
| callout([
|
| [t("Strength: ", { bold: true, size: 20 }), t("Finds semantically similar content even when the wording is completely different. Query: 'how long do I have to respond to a critical issue' will find the chunk containing 'Urgent/Critical - 1 hr' because the meanings are similar.", { size: 20 })],
|
| [t("Weakness: ", { bold: true, size: 20 }), t("Fails at exact keyword matching. If someone searches for a specific code, product name, or ID (like 'project code 79478-CA'), vector search can miss it because the meaning space doesn't preserve exact strings well.", { size: 20 })],
|
| ], "note"),
|
| spacer(),
|
|
|
| h2("Strategy 2: Sparse Retrieval (BM25 / Keyword Search)"),
|
| p(t("BM25 is the algorithm that powers traditional keyword search engines. It scores documents based on term frequency β how often query words appear in the chunk, weighted by how rare those words are across all chunks.", { size: 22 })),
|
| spacer(80),
|
| callout([
|
| [t("Strength: ", { bold: true, size: 20 }), t("Exact and near-exact keyword matching. Perfect for product codes, person names, specific IDs, and domain jargon that might not appear often in embedding training data.", { size: 20 })],
|
| [t("Weakness: ", { bold: true, size: 20 }), t("Completely blind to semantics. 'Urgent ticket response time' would score zero against a chunk that says 'Critical issue SLA' even though they mean the same thing.", { size: 20 })],
|
| ], "note"),
|
| spacer(),
|
|
|
| h2("Strategy 3: Hybrid Retrieval (Dense + Sparse)"),
|
| p(t("The current production standard. Run both dense and sparse retrieval in parallel, then merge the results. The merging step is called Reciprocal Rank Fusion (RRF).", { size: 22 })),
|
| spacer(80),
|
|
|
| h3("Reciprocal Rank Fusion (RRF)"),
|
| p(t("RRF merges two ranked lists (one from dense, one from sparse) into a single ranked list without needing to normalize scores across different scoring systems. The formula for each document's RRF score is:", { size: 22 })),
|
| spacer(60),
|
| p([t("score(d) = Ξ£ 1 / (k + rank(d))", { bold: true, size: 24, color: C.navy })], { align: AlignmentType.CENTER }),
|
| spacer(60),
|
| p(t("Where k is a constant (usually 60) and rank(d) is the document's position in each ranked list. A document that ranks #1 in both lists will score highest. A document that ranks #20 in one list but doesn't appear in the other will score lower than one that ranks #5 in both.", { size: 22 })),
|
| spacer(120),
|
| callout([
|
| "RRF is elegant because it is rank-based, not score-based. You do not need to worry that a BM25 score of 12.4 and a cosine similarity of 0.87 are on completely different scales. You only care about position in each list."
|
| ], "concept"),
|
| spacer(),
|
|
|
| h2("Strategy 4: Small-to-Large (Parent Document) Retrieval"),
|
| p(t("This is the pattern you have already designed for with your parent_elem_index field. The idea:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [4680, 4680],
|
| rows: [
|
| new TableRow({ children: [hCell("Small chunks (indexed)", C.navy), hCell("Large chunks (returned to LLM)", C.teal)] }),
|
| new TableRow({ children: [
|
| cell("128β256 tokens. Fine-grained. Match questions precisely because each chunk is about one specific thing.", { bg: C.lightBlue }),
|
| cell("The full section (all siblings under the same parent). 400β800 tokens. Give the LLM full context to answer correctly.", { bg: C.lightTeal }),
|
| ]}),
|
| ]
|
| }),
|
| spacer(120),
|
| p(t("Why both? Because precision and context are in tension. A small chunk matches questions precisely (high recall), but a small chunk often lacks enough context for the LLM to give a complete answer. By returning the full section, the LLM sees everything around the matched chunk.", { size: 22 })),
|
| spacer(80),
|
| callout([
|
| "In your implementation: vector search returns a chunk with parent_elem_index = 1. You then fetch ALL chunks where parent_elem_index == 1, sort by chunk_index, and join them. This reconstructed section goes into the LLM prompt β not the individual matched chunk."
|
| ], "tip"),
|
| spacer(),
|
|
|
| h2("Strategy 5: Multi-Query Retrieval"),
|
| p(t("A user's question is a single phrasing of their intent, but the answer might be stored under different phrasing. Multi-query retrieval generates multiple reformulations of the question using an LLM, runs retrieval for each, and merges the results.", { size: 22 })),
|
| spacer(80),
|
| p([t("Example: ", { bold: true, size: 22 }), t("User asks 'who do I contact for IT problems?'. Multi-query generates:", { size: 22 })]),
|
| bullet("'IT support contact information'"),
|
| bullet("'technology helpdesk details'"),
|
| bullet("'helpdesk URL for technical issues'"),
|
| p(t("Each gets retrieved separately, then deduplicated and merged. This significantly improves recall for queries that could be phrased many ways.", { size: 22 })),
|
| spacer(),
|
|
|
| h2("top_k: How Many Chunks to Retrieve?"),
|
| p(t("Every retrieval call asks for the top-k most similar chunks. This is a key hyperparameter:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [1600, 2800, 4960],
|
| rows: [
|
| new TableRow({ children: [hCell("top_k value", C.navy, C.white, 1600), hCell("Trade-off", C.navy, C.white, 2800), hCell("Risk", C.navy, C.white, 4960)] }),
|
| new TableRow({ children: [cell("3β5", { width: 1600, bold: true }), cell("Fast, cheap, focused", { width: 2800 }), cell("May miss relevant chunks if the answer is spread across multiple sections.", { width: 4960 })] }),
|
| new TableRow({ children: [cell("10β20", { width: 1600, bold: true, bg: C.lightGray }), cell("More coverage", { width: 2800, bg: C.lightGray }), cell("LLM context window fills up faster. More noise if retrieval quality is low.", { width: 4960, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("50+", { width: 1600, bold: true }), cell("High recall", { width: 2800 }), cell("Context rot: LLMs perform worse when the context is very long and the answer is buried.", { width: 4960 })] }),
|
| ]
|
| }),
|
| spacer(120),
|
| callout([
|
| "Practical starting point: retrieve top 10 from vector search, rerank them, pass top 3β5 to the LLM. The reranker is what lets you safely retrieve more candidates without sending all of them to the LLM."
|
| ], "tip"),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 4 β Reranking"),
|
| spacer(),
|
| h1("Reranking: Sorting Candidates by True Relevance"),
|
|
|
| h2("Why Retrieval Alone Is Not Enough"),
|
| p(t("Vector search is fast but approximate. It finds chunks that are in the same semantic neighborhood as your query, but the ranking within those results is imprecise. The top result is not always the most relevant β sometimes the 4th or 5th result is actually a better answer.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
|
| p(t("Reranking solves this by applying a more expensive but more accurate model to re-score the top-N candidates from retrieval.", { size: 22 })),
|
| spacer(),
|
|
|
| h2("How a Reranker Works"),
|
| p(t("A reranker is a cross-encoder model. Unlike a bi-encoder embedding model (which encodes query and document independently), a cross-encoder takes the query and a document together as a single input and outputs a relevance score.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [3000, 3180, 3180],
|
| rows: [
|
| new TableRow({ children: [hCell("", C.navy, C.white, 3000), hCell("Bi-encoder (Embedding model)", C.navy, C.white, 3180), hCell("Cross-encoder (Reranker)", C.navy, C.white, 3180)] }),
|
| new TableRow({ children: [cell("How it works", { width: 3000, bold: true }), cell("Encodes query and document independently. Compares vectors.", { width: 3180 }), cell("Reads query AND document together. Produces a single relevance score.", { width: 3180 })] }),
|
| new TableRow({ children: [cell("Speed", { width: 3000, bold: true, bg: C.lightGray }), cell("Fast. Can search millions of vectors in milliseconds.", { width: 3180, bg: C.lightGray }), cell("Slow. Must process query + each candidate individually.", { width: 3180, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("Accuracy", { width: 3000, bold: true }), cell("Good. Misses nuance because query and doc context are separate.", { width: 3180 }), cell("Better. Sees the interaction between query and document explicitly.", { width: 3180 })] }),
|
| new TableRow({ children: [cell("Use case", { width: 3000, bold: true, bg: C.lightGray }), cell("First-stage retrieval. Fast candidate selection.", { width: 3180, bg: C.lightGray }), cell("Second-stage rescoring. Applied only to top-N candidates.", { width: 3180, bg: C.lightGray })] }),
|
| ]
|
| }),
|
| spacer(160),
|
| callout([
|
| "The standard pipeline: retrieve top 20 candidates with fast vector search β rerank to top 5 with a cross-encoder β pass top 5 to the LLM. You get the recall of retrieving 20 and the precision of sending only the best 5."
|
| ], "tip"),
|
| spacer(),
|
|
|
| h2("Reranker Options"),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 1600, 5360],
|
| rows: [
|
| new TableRow({ children: [hCell("Model", C.navy, C.white, 2400), hCell("Type", C.navy, C.white, 1600), hCell("Notes", C.navy, C.white, 5360)] }),
|
| ...([
|
| ["Cohere Rerank 3", "API", "Best quality. Supports 100+ languages. Accepts up to 10,000 token documents. Pay per call."],
|
| ["BGE-Reranker-Large", "Local", "Strong open-source option. Run on your own GPU/CPU. BAAI family."],
|
| ["cross-encoder/ms-marco-MiniLM", "Local", "Small and fast. Good for CPU-only environments. Slightly lower quality."],
|
| ["Jina Reranker v2", "API / Local", "Good multilingual support. Can run locally or via API."],
|
| ]).map(([m, type, notes], i) => new TableRow({ children: [
|
| cell(m, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
|
| cell(type, { width: 1600, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(notes, { width: 5360, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
| spacer(),
|
|
|
| h2("Is Reranking Always Necessary?"),
|
| p(t("No. It is a quality-vs-latency trade-off. Add it when:", { size: 22 })),
|
| bullet("Your retrieval is returning the right sections but not always in the right order."),
|
| bullet("Your queries are complex or multi-part."),
|
| bullet("Accuracy matters more than response time."),
|
| spacer(80),
|
| p(t("Skip it initially. Build without reranking first, measure quality, then add it when you identify it as the bottleneck.", { size: 22 })),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 5 β Generation"),
|
| spacer(),
|
| h1("Generation: From Retrieved Chunks to Final Answer"),
|
|
|
| h2("The Prompt Structure"),
|
| p(t("The generation step takes the retrieved (and optionally reranked) chunks, assembles them into a prompt, and calls an LLM. The prompt structure is the main lever you control here.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2000, 7360],
|
| rows: [
|
| new TableRow({ children: [hCell("Prompt Section", C.navy, C.white, 2000), hCell("Content", C.navy, C.white, 7360)] }),
|
| new TableRow({ children: [cell("System prompt", { width: 2000, bold: true }), cell("Instructions for the LLM. Role, constraints, behavior. Example: 'You are a helpful assistant. Answer only using the provided context. If the answer is not in the context, say so explicitly.'", { width: 7360 })] }),
|
| new TableRow({ children: [cell("Context block", { width: 2000, bold: true, bg: C.lightGray }), cell("The retrieved chunks, formatted with their section paths as headers. Each chunk is clearly labeled with its source section so the LLM can cite it.", { width: 7360, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("User question", { width: 2000, bold: true }), cell("The original user query, unchanged.", { width: 7360 })] }),
|
| ]
|
| }),
|
| spacer(),
|
|
|
| h2("Context Rot: A Real Problem"),
|
| p(t("Research has consistently shown that LLMs perform worse when the relevant information is buried in the middle of a long context. Performance is highest when the relevant chunk is at the beginning or end of the context block. This is called 'lost in the middle'.", { size: 22 })),
|
| spacer(80),
|
| callout([
|
| "Practical implication: after reranking, order your chunks so the highest-scoring ones appear first in the context block, not in the middle. Also: do not send more context than necessary. 3 high-quality chunks often outperform 10 mediocre ones."
|
| ], "warning"),
|
| spacer(),
|
|
|
| h2("Grounding and Hallucination Prevention"),
|
| p(t("The system prompt is your primary tool for keeping the LLM grounded in your documents. Key instructions to include:", { size: 22 })),
|
| bullet("'Answer only based on the provided context documents.'"),
|
| bullet("'If the context does not contain enough information to answer, say: I could not find this in the provided documents.'"),
|
| bullet("'Do not make up information. Do not use your general knowledge.'"),
|
| bullet("'Cite the section you are drawing from when possible.'"),
|
| spacer(80),
|
| callout([
|
| "No prompt instruction eliminates hallucination entirely, but clear grounding instructions reduce it significantly. The more specific you are about what the LLM should do when it doesn't know, the more reliable the behavior."
|
| ], "note"),
|
| spacer(),
|
|
|
| h2("LLM Options for Generation"),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 1600, 1360, 4000],
|
| rows: [
|
| new TableRow({ children: [hCell("Model", C.navy, C.white, 2400), hCell("Context Window", C.navy, C.white, 1600), hCell("Cost", C.navy, C.white, 1360), hCell("Notes", C.navy, C.white, 4000)] }),
|
| ...([
|
| ["Claude Sonnet 4", "200K tokens", "Medium", "Excellent instruction following. Great for RAG. Strong at staying grounded in context."],
|
| ["GPT-4o", "128K tokens", "Medium", "Strong all-around. Good at following complex system prompts."],
|
| ["GPT-4o mini", "128K tokens", "Low", "Good quality at lower cost. Reasonable grounding."],
|
| ["Llama 3.1 70B", "128K tokens", "Self-host", "Open source. Good quality. Requires GPU infrastructure."],
|
| ]).map(([m, ctx, cost, notes], i) => new TableRow({ children: [
|
| cell(m, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
|
| cell(ctx, { width: 1600, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(cost, { width: 1360, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(notes, { width: 4000, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 6 β Evaluation"),
|
| spacer(),
|
| h1("Evaluation: Knowing If Your RAG System Is Working"),
|
|
|
| h2("Why Evaluation Is Non-Negotiable"),
|
| p(t("Without evaluation, you are flying blind. You might swap embedding models, change chunk sizes, or add reranking β and have no way of knowing if things got better or worse. A simple evaluation harness built early saves enormous time later.", { size: 22 })),
|
| spacer(),
|
|
|
| h2("The Three Things That Can Fail"),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 3480, 3480],
|
| rows: [
|
| new TableRow({ children: [hCell("Failure Mode", C.navy, C.white, 2400), hCell("Symptom", C.navy, C.white, 3480), hCell("Root Cause", C.navy, C.white, 3480)] }),
|
| new TableRow({ children: [cell("Retrieval failure", { width: 2400, bold: true }), cell("The right chunk never made it into the context at all.", { width: 3480 }), cell("Chunk boundaries wrong. Embedding model poor fit. top_k too low.", { width: 3480 })] }),
|
| new TableRow({ children: [cell("Context failure", { width: 2400, bold: true, bg: C.lightGray }), cell("The right chunk was retrieved but the LLM couldn't use it properly.", { width: 3480, bg: C.lightGray }), cell("Chunk too small, context rot, section path missing.", { width: 3480, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("Generation failure", { width: 2400, bold: true }), cell("Context was correct but the answer was wrong or hallucinated.", { width: 3480 }), cell("Weak system prompt, wrong LLM, temperature too high.", { width: 3480 })] }),
|
| ]
|
| }),
|
| spacer(160),
|
| callout([
|
| "When your RAG agent gives a wrong answer, always ask: was the right chunk retrieved? Check the context that was actually passed to the LLM before blaming generation. Most of the time, retrieval is the culprit."
|
| ], "warning"),
|
| spacer(),
|
|
|
| h2("RAGAS: The Standard Evaluation Framework"),
|
| p(t("RAGAS (RAG Assessment) is an open-source framework that measures RAG quality without requiring hand-labeled ground truth. It uses an LLM to evaluate the quality of retrieved context and generated answers.", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 6960],
|
| rows: [
|
| new TableRow({ children: [hCell("Metric", C.navy, C.white, 2400), hCell("What it measures", C.navy, C.white, 6960)] }),
|
| new TableRow({ children: [cell("Context Precision", { width: 2400, bold: true }), cell("Of the chunks retrieved, how many were actually relevant? High precision = low noise in context.", { width: 6960 })] }),
|
| new TableRow({ children: [cell("Context Recall", { width: 2400, bold: true, bg: C.lightGray }), cell("Of all the relevant chunks in the knowledge base, how many were retrieved? High recall = not missing important chunks.", { width: 6960, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("Faithfulness", { width: 2400, bold: true }), cell("Is the generated answer factually consistent with the retrieved context? Measures hallucination.", { width: 6960 })] }),
|
| new TableRow({ children: [cell("Answer Relevancy", { width: 2400, bold: true, bg: C.lightGray }), cell("Does the generated answer actually address the question asked? Measures completeness.", { width: 6960, bg: C.lightGray })] }),
|
| ]
|
| }),
|
| spacer(160),
|
|
|
| h2("Minimum Viable Evaluation (Before Using RAGAS)"),
|
| p(t("Before setting up a full evaluation framework, do this manually with 10β15 questions from your actual documents:", { size: 22 })),
|
| bullet("Write 10 questions whose answers exist clearly in your SOP documents."),
|
| bullet("For each question, inspect the retrieved chunks β did the right section appear?"),
|
| bullet("For each question, read the generated answer β is it correct? Does it hallucinate?"),
|
| bullet("Classify each failure as retrieval failure, context failure, or generation failure."),
|
| bullet("The failure distribution tells you exactly what to fix first."),
|
| spacer(80),
|
| callout([
|
| "This manual inspection of 10 questions will teach you more about your system than any automated metric. Do it before writing more code."
|
| ], "tip"),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 7 β Your Decision Guide"),
|
| spacer(),
|
| h1("Making Decisions for Your Specific System"),
|
|
|
| h2("What You Have Already Built"),
|
| callout([
|
| "Parser: DOCX β structured blocks with type, heading_level, elem_index, page_index.",
|
| "Parent chain: every block has parent_id pointing to its heading ancestor.",
|
| "Section-aware chunks: blocks grouped by parent, section_path prepended to text_to_embed.",
|
| "Table handling: oversized tables get LLM summaries; summary is embedded, raw table is stored.",
|
| "Two-field design: text_to_embed (goes to embedding model) vs raw_text (goes to LLM).",
|
| ], "tip"),
|
| spacer(),
|
|
|
| h2("Decision 1: Which Embedding Model?"),
|
| p(t("Recommendation: Start with OpenAI text-embedding-3-small.", { size: 22, bold: true })),
|
| bullet("It has an 8191 token max β your chunks (under 500 tokens + section path prefix) are well within limits."),
|
| bullet("The 1536-dimensional vectors offer strong quality for English-language business documents."),
|
| bullet("It is the most common choice, meaning most tutorials, examples, and integrations assume it."),
|
| bullet("If you later need better quality: upgrade to text-embedding-3-large (same API, just a model name change)."),
|
| bullet("If you later need no API dependency: switch to BAAI/bge-large-en-v1.5 locally."),
|
| spacer(80),
|
| callout(["Do not over-engineer this choice. The embedding model is the easiest thing to swap later. Chunk quality matters far more, and you have already invested there."], "note"),
|
| spacer(),
|
|
|
| h2("Decision 2: Which Vector Database?"),
|
| p(t("Recommendation: Qdrant running locally via Docker.", { size: 22, bold: true })),
|
| bullet("Free, no account required, runs on your machine."),
|
| bullet("Supports hybrid search (dense + sparse BM25) natively β you will want this."),
|
| bullet("Rich payload filtering β you can filter by source_file, has_table, parent_elem_index."),
|
| bullet("Same API works when you eventually move to Qdrant Cloud."),
|
| spacer(),
|
|
|
| h2("Decision 3: Which Retrieval Strategy?"),
|
| p(t("Recommendation: Start with dense retrieval only, then add BM25 hybrid once baseline works.", { size: 22, bold: true })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2000, 7360],
|
| rows: [
|
| new TableRow({ children: [hCell("Phase", C.navy, C.white, 2000), hCell("What to implement", C.navy, C.white, 7360)] }),
|
| new TableRow({ children: [cell("Phase 1", { width: 2000, bold: true }), cell("Dense vector search only. top_k = 10. Return parent sections (small-to-large). No reranker.", { width: 7360 })] }),
|
| new TableRow({ children: [cell("Phase 2", { width: 2000, bold: true, bg: C.lightGray }), cell("Add BM25 sparse search. Merge with RRF. Especially valuable for queries with specific names, codes, email addresses.", { width: 7360, bg: C.lightGray })] }),
|
| new TableRow({ children: [cell("Phase 3", { width: 2000, bold: true }), cell("Add Cohere Rerank. Retrieve top 20, rerank to top 5, pass top 5 to LLM.", { width: 7360 })] }),
|
| ]
|
| }),
|
| spacer(),
|
|
|
| h2("Decision 4: Which LLM for Generation?"),
|
| p(t("Recommendation: Claude Sonnet 4 or GPT-4o. Both follow grounding instructions well.", { size: 22, bold: true })),
|
| bullet("Set temperature to 0 for factual Q&A β you want deterministic, grounded answers."),
|
| bullet("Write a strict system prompt that forbids answering outside the provided context."),
|
| bullet("Include section paths as headers in your context block so the LLM can cite sources."),
|
| spacer(),
|
|
|
| h2("The Build Order"),
|
| p(t("Given where you are right now, here is the recommended sequence:", { size: 22 })),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [800, 2800, 5760],
|
| rows: [
|
| new TableRow({ children: [hCell("#", C.navy, C.white, 800), hCell("Task", C.navy, C.white, 2800), hCell("Why this order", C.navy, C.white, 5760)] }),
|
| ...([
|
| ["1", "Embed chunks + store in Qdrant", "Nothing else works until you have searchable vectors."],
|
| ["2", "Build basic dense retrieval + parent expansion", "This alone can answer most questions. Establish your baseline."],
|
| ["3", "Evaluate manually with 10 real questions", "Find out where you are failing before adding complexity."],
|
| ["4", "Add BM25 hybrid + RRF", "Fixes failures on specific names, codes, and exact terms."],
|
| ["5", "Add reranker", "Fixes ordering failures β right chunks retrieved but ranked wrong."],
|
| ["6", "Add multi-query expansion", "Fixes cases where phrasing mismatch causes miss."],
|
| ["7", "Set up RAGAS evaluation", "Automate quality tracking so regressions are caught."],
|
| ]).map(([n, task, why], i) => new TableRow({ children: [
|
| cell(n, { width: 800, bold: true, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
|
| cell(task, { width: 2800, bold: true, bg: i%2===0?C.lightGray:C.white }),
|
| cell(why, { width: 5760, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| }),
|
| spacer(160),
|
| callout([
|
| "The biggest mistake engineers make in RAG: adding complexity (reranking, multi-query, graph RAG) before establishing and measuring a baseline. Build the simplest thing that could work. Measure it. Then add exactly the complexity that fixes the specific failure mode you observe."
|
| ], "warning"),
|
|
|
| pageBreak()
|
| );
|
|
|
|
|
|
|
|
|
| children.push(
|
| sectionBanner("PART 8 β Glossary"),
|
| spacer(),
|
| h1("Glossary of Key Terms"),
|
| spacer(80),
|
| new Table({
|
| width: { size: 9360, type: WidthType.DXA },
|
| columnWidths: [2400, 6960],
|
| rows: [
|
| new TableRow({ children: [hCell("Term", C.navy, C.white, 2400), hCell("Definition", C.navy, C.white, 6960)] }),
|
| ...([
|
| ["ANN (Approximate Nearest Neighbor)", "An algorithm for finding vectors that are close to a query vector, without checking every single stored vector. Trades a small amount of accuracy for large speed gains."],
|
| ["BM25", "Best Match 25. A keyword-based ranking algorithm. Scores documents by term frequency weighted by inverse document frequency (IDF). The basis of traditional search engines."],
|
| ["Bi-encoder", "An embedding model architecture where query and document are encoded independently, then compared by vector similarity. Fast. Used for first-stage retrieval."],
|
| ["Chunking", "The process of splitting documents into smaller text segments for embedding. The quality of chunking is the single largest determinant of RAG quality."],
|
| ["Contextual Retrieval", "Anthropic's technique of prepending a chunk's document context (section path, summary) to the chunk text before embedding, so the vector captures where the chunk lives in the document."],
|
| ["Context Rot", "The phenomenon where LLM performance degrades when relevant information is buried in the middle of a long context. Also called 'lost in the middle'."],
|
| ["Cosine Similarity", "A measure of the angle between two vectors. Range: -1 to 1. Used as the similarity metric in most RAG vector searches. 1 = identical direction, 0 = orthogonal, -1 = opposite."],
|
| ["Cross-encoder", "A reranker model architecture that takes query and document as a single input and outputs a relevance score. More accurate than bi-encoders but too slow for first-stage retrieval."],
|
| ["Dense Retrieval", "Retrieval based on vector similarity (embeddings). Finds semantically similar content even when wording differs."],
|
| ["Embedding", "A vector (list of floats) that represents the meaning of a text. Produced by an embedding model. Similar texts produce similar vectors."],
|
| ["Faithfulness", "A RAGAS metric measuring whether a generated answer is factually grounded in the retrieved context (i.e., not hallucinated)."],
|
| ["Hallucination", "When an LLM generates plausible-sounding but factually incorrect information not supported by its context."],
|
| ["Hybrid Retrieval", "Combining dense (vector) and sparse (BM25) retrieval results into a single ranked list, usually via Reciprocal Rank Fusion."],
|
| ["Indexing", "The offline phase of RAG: parsing, chunking, embedding, and storing documents in a vector database."],
|
| ["Payload", "The metadata stored alongside a vector in a vector DB. Contains raw_text, section_path, source_file, etc. Retrieved after a search match."],
|
| ["RAGAS", "RAG Assessment. An open-source framework for evaluating RAG systems using LLM-based metrics (context precision, recall, faithfulness, answer relevancy)."],
|
| ["Reciprocal Rank Fusion (RRF)", "An algorithm for merging ranked lists from different retrieval systems without needing to normalize scores. Combines dense and sparse results by rank position."],
|
| ["Reranking", "A second-stage scoring step that re-orders the top-N retrieved candidates using a cross-encoder model for higher accuracy before passing to the LLM."],
|
| ["Section Path", "The full heading ancestry of a chunk expressed as a breadcrumb (e.g., 'SOP Title > Section Name'). Prepended to chunks before embedding for contextual retrieval."],
|
| ["Small-to-Large Retrieval", "A pattern where small chunks are indexed for precise retrieval, but the full parent section is fetched and sent to the LLM for generation."],
|
| ["Sparse Retrieval", "Retrieval based on keyword matching (BM25). Excels at exact term matching but blind to semantic similarity."],
|
| ["top_k", "The number of candidates to retrieve from the vector database. A hyperparameter that controls the recall-precision trade-off."],
|
| ["Vector Database", "A database optimized for storing and searching high-dimensional vectors. Supports approximate nearest neighbor search and payload filtering."],
|
| ]).map(([term, def], i) => new TableRow({ children: [
|
| cell(term, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
|
| cell(def, { width: 6960, bg: i%2===0?C.lightGray:C.white }),
|
| ]}))
|
| ]
|
| })
|
| );
|
|
|
|
|
|
|
|
|
| const doc = new Document({
|
| numbering: {
|
| config: [
|
| {
|
| reference: "bullets",
|
| levels: [
|
{ level: 0, format: LevelFormat.BULLET, text: "•", alignment: AlignmentType.LEFT,
|
| style: { paragraph: { indent: { left: 720, hanging: 360 } }, run: { font: "Arial", size: 22 } } },
|
{ level: 1, format: LevelFormat.BULLET, text: "◦", alignment: AlignmentType.LEFT,
|
| style: { paragraph: { indent: { left: 1080, hanging: 360 } }, run: { font: "Arial", size: 22 } } },
|
| ]
|
| }
|
| ]
|
| },
|
| styles: {
|
| default: { document: { run: { font: "Arial", size: 22, color: C.black } } },
|
| paragraphStyles: [
|
| { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true,
|
| run: { size: 36, bold: true, color: C.navy, font: "Arial" },
|
| paragraph: { spacing: { before: 360, after: 160 }, outlineLevel: 0 } },
|
| { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true,
|
| run: { size: 28, bold: true, color: C.blue, font: "Arial" },
|
| paragraph: { spacing: { before: 280, after: 120 }, outlineLevel: 1 } },
|
| { id: "Heading3", name: "Heading 3", basedOn: "Normal", next: "Normal", quickFormat: true,
|
| run: { size: 24, bold: true, color: C.teal, font: "Arial" },
|
| paragraph: { spacing: { before: 200, after: 80 }, outlineLevel: 2 } },
|
| ]
|
| },
|
| sections: [{
|
| properties: {
|
| page: {
|
| size: { width: 12240, height: 15840 },
|
| margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 }
|
| }
|
| },
|
| children
|
| }]
|
| });
|
|
|
| Packer.toBuffer(doc).then(buf => {
|
| fs.writeFileSync("/mnt/user-data/outputs/RAG_System_Design_Study_Guide.docx", buf);
|
| console.log("Done");
|
| }); |