const {
Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell,
HeadingLevel, AlignmentType, BorderStyle, WidthType, ShadingType,
TableOfContents, PageBreak, LevelFormat, UnderlineType
} = require('docx');
const fs = require('fs');
// ── colour palette ──────────────────────────────────────────────────────────
// Hex RGB strings without a leading '#', passed straight to the docx
// library's `color` (text) and `fill` (shading) options. Each dark colour
// has a matching light tint used for callout/table backgrounds.
const C = {
navy: "1B3A5C",
blue: "2E75B6",
lightBlue: "D6E4F0",
teal: "1A7A6E",
lightTeal: "D4EFEC",
amber: "C45911",
lightAmber:"FCE9D9",
purple: "5B3A8C",
lightPurple:"E8DEFF",
gray: "595959",
lightGray: "F2F2F2",
midGray: "D9D9D9",
white: "FFFFFF",
black: "000000",
};
// ── helpers ─────────────────────────────────────────────────────────────────
// Border construction helpers shared by every table in the document.

// One thin visible edge in the given colour (defaults to mid gray).
function border(color = C.midGray) {
  return { style: BorderStyle.SINGLE, size: 1, color };
}

// The thin visible edge applied to all four sides of a cell.
function borders(color = C.midGray) {
  return {
    top: border(color),
    bottom: border(color),
    left: border(color),
    right: border(color),
  };
}

// An invisible edge (style NONE, zero width) used to hide table chrome.
function noBorder() {
  return { style: BorderStyle.NONE, size: 0, color: C.white };
}

// Invisible edges on all four sides.
function noBorders() {
  return {
    top: noBorder(),
    bottom: noBorder(),
    left: noBorder(),
    right: noBorder(),
  };
}
/**
 * Build one table cell holding a single paragraph of Arial text.
 *
 * @param {string} text - cell contents.
 * @param {object} [opts] - styling options:
 *   bold/italic (run emphasis), color (text colour), bg (fill colour),
 *   width (DXA), size (run size, passed through to docx), align
 *   (paragraph alignment), shade (set false to omit shading entirely).
 * @returns {TableCell}
 */
function cell(text, opts = {}) {
  const {
    shade = true,
    align = AlignmentType.LEFT,
    size = 20,
    width = 4680,
    bg = C.white,
    color = C.black,
    italic = false,
    bold = false,
  } = opts;

  const run = new TextRun({ text, bold, color, italics: italic, size, font: "Arial" });
  const content = new Paragraph({ alignment: align, children: [run] });

  return new TableCell({
    borders: borders(C.midGray),
    width: { size: width, type: WidthType.DXA },
    shading: shade ? { fill: bg, type: ShadingType.CLEAR } : undefined,
    margins: { top: 80, bottom: 80, left: 140, right: 140 },
    children: [content],
  });
}
// Header-cell variant of cell(): bold text, white-on-navy by default.
function hCell(text, bg = C.navy, textColor = C.white, width = 4680) {
  const headerStyle = { bold: true, color: textColor, bg, width, size: 20 };
  return cell(text, headerStyle);
}
// Wrap one TextRun (or an array of runs) in a Paragraph.
// Default spacing is a compact 80/80; pass opts.spacing to override,
// opts.align to set alignment.
function p(runs, opts = {}) {
  let spacing = opts.spacing;
  if (spacing === undefined) {
    spacing = { before: 80, after: 80 };
  }
  const children = Array.isArray(runs) ? runs : [runs];
  return new Paragraph({
    alignment: opts.align,
    spacing,
    children,
  });
}
// Shorthand for an Arial TextRun. All options are forwarded to docx;
// note the rename: opts.italic maps onto docx's `italics` property.
function t(text, opts = {}) {
  const { underline, size = 22, color = C.black, italic = false, bold = false } = opts;
  const runProps = { text, bold, italics: italic, color, size, font: "Arial", underline };
  return new TextRun(runProps);
}
// Shared factory for the three heading helpers below. They previously
// triplicated the identical Paragraph construction; only the Word heading
// level, colour, run size and spacing differ, so those are parameters.
function makeHeading(text, level, color, size, before, after) {
  return new Paragraph({
    heading: level,
    spacing: { before, after },
    children: [new TextRun({ text, bold: true, color, size, font: "Arial" })],
  });
}

// Top-level section heading: navy, largest size, widest spacing.
function h1(text) {
  return makeHeading(text, HeadingLevel.HEADING_1, C.navy, 36, 360, 160);
}

// Second-level heading: blue, medium size.
function h2(text) {
  return makeHeading(text, HeadingLevel.HEADING_2, C.blue, 28, 280, 120);
}

// Third-level heading: teal, smallest of the three.
function h3(text) {
  return makeHeading(text, HeadingLevel.HEADING_3, C.teal, 24, 200, 80);
}
// One bulleted list item. `level` selects the indent level of the
// numbering definition registered under the "bullets" reference; the
// reference itself must be declared in the Document's numbering config.
function bullet(text, level = 0, color = C.black) {
  const run = new TextRun({ text, color, size: 22, font: "Arial" });
  return new Paragraph({
    numbering: { reference: "bullets", level },
    spacing: { before: 40, after: 40 },
    children: [run],
  });
}
/**
 * Render a coloured callout box: a single-cell table with a tinted
 * background, an accent-coloured border (thicker on the left, like a
 * classic "aside" bar), a bold label line, and one paragraph per entry
 * in `lines`.
 *
 * Fixes vs. the original: the "tip" label string contained a raw newline
 * inside the literal (a syntax error) and the emoji were mojibake; labels
 * are restored. An unknown `type` now falls back to "note" instead of
 * crashing on an undefined config.
 *
 * @param {Array<string|TextRun[]>} lines - each entry is either a plain
 *   string (styled at size 20) or an array of pre-built TextRuns.
 * @param {string} [type="note"] - "note" | "tip" | "warning" | "concept".
 * @returns {Table}
 */
function callout(lines, type = "note") {
  const configs = {
    note: { bg: C.lightBlue, border: C.blue, label: "📝 NOTE", labelColor: C.blue },
    tip: { bg: C.lightTeal, border: C.teal, label: "✅ KEY IDEA", labelColor: C.teal },
    warning: { bg: C.lightAmber, border: C.amber, label: "⚠️ WATCH OUT", labelColor: C.amber },
    concept: { bg: C.lightPurple, border: C.purple, label: "🧠 CONCEPT", labelColor: C.purple },
  };
  // Unknown type degrades gracefully rather than throwing on cfg.border below.
  const cfg = configs[type] ?? configs.note;
  return new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [9360],
    rows: [
      new TableRow({
        children: [new TableCell({
          // Left edge is size 12 (vs 1 elsewhere) to form the accent bar.
          borders: {
            top: border(cfg.border),
            bottom: border(cfg.border),
            left: { style: BorderStyle.SINGLE, size: 12, color: cfg.border },
            right: border(cfg.border),
          },
          shading: { fill: cfg.bg, type: ShadingType.CLEAR },
          margins: { top: 120, bottom: 120, left: 200, right: 200 },
          width: { size: 9360, type: WidthType.DXA },
          children: [
            p(t(cfg.label, { bold: true, color: cfg.labelColor, size: 20 }), { spacing: { before: 0, after: 60 } }),
            ...lines.map(l => p(Array.isArray(l) ? l : t(l, { size: 20 }), { spacing: { before: 20, after: 20 } })),
          ],
        })],
      }),
    ],
  });
}
// Empty paragraph used purely for vertical whitespace; `pts` is fed to
// docx's spacing.before value unchanged.
function spacer(pts = 120) {
  const blank = new TextRun("");
  return new Paragraph({
    spacing: { before: pts, after: 0 },
    children: [blank],
  });
}
// Horizontal rule: an empty paragraph whose bottom border draws the line.
function divider() {
  const rule = { style: BorderStyle.SINGLE, size: 4, color: C.midGray, space: 1 };
  return new Paragraph({
    spacing: { before: 160, after: 160 },
    border: { bottom: rule },
    children: [new TextRun("")],
  });
}
// Full-width coloured banner used to open each PART of the guide:
// a borderless one-cell table with solid shading and big white text.
function sectionBanner(text, bg = C.navy) {
  const title = new Paragraph({
    alignment: AlignmentType.LEFT,
    children: [new TextRun({ text, bold: true, color: C.white, size: 32, font: "Arial" })],
  });

  const box = new TableCell({
    borders: noBorders(),
    shading: { fill: bg, type: ShadingType.CLEAR },
    margins: { top: 160, bottom: 160, left: 280, right: 280 },
    width: { size: 9360, type: WidthType.DXA },
    children: [title],
  });

  return new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [9360],
    rows: [new TableRow({ children: [box] })],
  });
}
// Simple data table: row 0 is rendered as a navy header row, the rest as
// zebra-striped body rows. Despite the name it handles any column count —
// `widths` just defaults to a two-column layout.
function twoColTable(rows, widths = [3200, 6160]) {
  const buildRow = (rowValues, rowIndex) => {
    const cells = rowValues.map((cellText, colIndex) => {
      if (rowIndex === 0) {
        return hCell(cellText, C.navy, C.white, widths[colIndex]);
      }
      const stripe = rowIndex % 2 === 0 ? C.lightGray : C.white;
      return cell(cellText, { width: widths[colIndex], bg: stripe });
    });
    return new TableRow({ children: cells });
  };

  return new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: widths,
    rows: rows.map(buildRow),
  });
}
// ── page break ──────────────────────────────────────────────────────────────
// A paragraph containing nothing but an explicit page break.
function pageBreak() {
  const brk = new PageBreak();
  return new Paragraph({ children: [brk] });
}
// ────────────────────────────────────────────────────────────────────────────
// DOCUMENT CONTENT
// ────────────────────────────────────────────────────────────────────────────
// Accumulates every top-level document element (paragraphs, tables, page
// breaks) in reading order; consumed when the Document is assembled later.
const children = [];
// ── COVER ───────────────────────────────────────────────────────────────────
// Cover page: large vertical offset, centred title + subtitle, a thin blue
// rule (rendered as a borderless one-row table), a topics line, then a page
// break so Part 0 starts on a fresh page.
children.push(
spacer(1440),
// Main title
new Paragraph({
alignment: AlignmentType.CENTER,
spacing: { before: 0, after: 120 },
children: [new TextRun({ text: "RAG SYSTEM DESIGN", bold: true, color: C.navy, size: 72, font: "Arial" })]
}),
// Subtitle
new Paragraph({
alignment: AlignmentType.CENTER,
spacing: { before: 0, after: 240 },
children: [new TextRun({ text: "A Concept Study Guide for Engineers", color: C.blue, size: 36, font: "Arial", italics: true })]
}),
// Decorative horizontal rule: a 5040-DXA-wide, blue-shaded, borderless
// single-cell table containing an empty paragraph.
new Table({
width: { size: 5040, type: WidthType.DXA },
columnWidths: [5040],
rows: [new TableRow({ children: [new TableCell({
borders: noBorders(),
shading: { fill: C.blue, type: ShadingType.CLEAR },
margins: { top: 4, bottom: 4, left: 0, right: 0 },
width: { size: 5040, type: WidthType.DXA },
children: [new Paragraph({ children: [new TextRun("")] })]
})] })],
}),
spacer(480),
// Topics covered line
new Paragraph({
alignment: AlignmentType.CENTER,
spacing: { before: 0, after: 80 },
children: [new TextRun({ text: "Covers: Chunking Β· Embedding Β· Vector Databases Β· Retrieval Β· Reranking Β· Generation", color: C.gray, size: 22, font: "Arial" })]
}),
spacer(1200),
pageBreak()
);
// ── TOC placeholder ─────────────────────────────────────────────────────────
// Table of contents built from Heading 1–3 styles.
// NOTE(review): docx emits a TOC *field*; most viewers show it empty until
// Word updates fields — confirm this matches the intended reader workflow.
children.push(
h1("Table of Contents"),
new TableOfContents("Table of Contents", {
hyperlink: true,
headingStyleRange: "1-3",
stylesWithLevels: [
{ styleName: "Heading 1", level: 1 },
{ styleName: "Heading 2", level: 2 },
{ styleName: "Heading 3", level: 3 },
],
}),
pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 0 — THE BIG PICTURE
// ────────────────────────────────────────────────────────────────────────────
// Part 0: pipeline overview — a two-column indexing/query comparison table,
// a "tip" callout, and an 8-step pipeline summary table.
children.push(
sectionBanner("PART 0 β The Big Picture"),
spacer(),
h1("How a RAG System Works End-to-End"),
p(t("Before diving into each component, you need a mental model of the whole pipeline. Every decision you make β how to chunk, which embedding model to pick, how to retrieve β only makes sense in the context of what comes before and after it.", { size: 22 }), { spacing: { before: 80, after: 160 } }),
h2("The Two Phases"),
p(t("A RAG system has two completely separate phases that run at different times:", { size: 22 })),
spacer(80),
// Indexing vs query phase: two equal columns, colour-coded navy/teal.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [4680, 4680],
rows: [
new TableRow({ children: [hCell("INDEXING PHASE (Offline)", C.navy), hCell("QUERY PHASE (Online / Real-time)", C.teal)] }),
new TableRow({ children: [
cell("Runs once (or when docs change). Takes your documents, processes them, and stores them in a vector database. The user never sees this.", { bg: C.lightBlue }),
cell("Runs every time a user asks a question. Takes the question, finds relevant content, and generates an answer.", { bg: C.lightTeal }),
]}),
new TableRow({ children: [
cell("Parse β Chunk β Embed β Store", { bold: true, bg: C.lightBlue }),
cell("Embed query β Retrieve β Rerank β Generate", { bold: true, bg: C.lightTeal }),
]}),
]
}),
spacer(160),
callout([
"Why does this matter? Because the indexing phase determines the ceiling of your system. No matter how good your retrieval logic is, if your chunks are poorly formed, the LLM cannot give a good answer. You have already done the hardest part of indexing β parsing and chunking. The decisions ahead are about embedding and retrieval."
], "tip"),
spacer(),
h2("The Full Pipeline at a Glance"),
spacer(80),
// 8-row pipeline table; data rows are built from string triples and
// zebra-striped by index.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [1200, 2400, 5760],
rows: [
new TableRow({ children: [hCell("Step", C.navy, C.white, 1200), hCell("Component", C.navy, C.white, 2400), hCell("What Happens", C.navy, C.white, 5760)] }),
...[
["1", "Parsing", "Raw document (PDF/DOCX) β structured blocks with type, content, heading level. YOU HAVE DONE THIS."],
["2", "Chunking", "Structured blocks β text segments of controlled size, each knowing their section context. YOU HAVE DONE THIS."],
["3", "Embedding (Index)", "Each chunk's text β a vector (array of floats). Similar meaning = similar vector direction."],
["4", "Vector Storage", "Vectors + metadata stored in a vector database. Supports fast similarity search."],
["5", "Query Embedding", "User's question β same embedding model β a query vector."],
["6", "Retrieval", "Query vector compared against all stored vectors. Top-N most similar chunks returned."],
["7", "Reranking", "A second, slower model re-scores the top-N chunks for true relevance. Optional but powerful."],
["8", "Generation", "Retrieved chunks assembled into a prompt. LLM reads them and writes the answer."],
].map(([step, comp, what], i) => new TableRow({ children: [
cell(step, { width: 1200, align: AlignmentType.CENTER, bold: true, bg: i % 2 === 0 ? C.lightGray : C.white }),
cell(comp, { width: 2400, bold: true, bg: i % 2 === 0 ? C.lightGray : C.white }),
cell(what, { width: 5760, bg: i % 2 === 0 ? C.lightGray : C.white }),
]}))
]
}),
pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 1 — EMBEDDINGS
// ────────────────────────────────────────────────────────────────────────────
// Part 1: embeddings — definition + example table, model-selection tables
// (hosted and local), symmetric/asymmetric search, and the
// text_to_embed vs raw_text distinction.
children.push(
sectionBanner("PART 1 β Embeddings"),
spacer(),
h1("Embeddings: Turning Text Into Numbers"),
h2("What Is an Embedding?"),
p(t("An embedding is a list of floating point numbers (a vector) that represents the meaning of a piece of text. The key property is that texts with similar meanings produce vectors that point in similar directions in high-dimensional space.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
p(t("For example:", { size: 22 })),
spacer(60),
// Example: a question and its answering fact capture the same meaning.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [4000, 5360],
rows: [
new TableRow({ children: [hCell("Text", C.navy, C.white, 4000), hCell("What the embedding captures", C.navy, C.white, 5360)] }),
new TableRow({ children: [cell("What is the SLA for urgent tickets?", { width: 4000 }), cell("Intent: asking about time limits for a specific urgency level", { width: 5360 })] }),
new TableRow({ children: [cell("Urgent/Critical - 1 hr", { width: 4000, bg: C.lightGray }), cell("Fact: a time value tied to a urgency category", { width: 5360, bg: C.lightGray })] }),
]
}),
spacer(120),
p(t("These two texts would have very similar vectors even though they use different words β and that is why vector search can find the answer to a question even when the document doesn't use the exact same phrasing as the query.", { size: 22, italic: true })),
spacer(),
h2("Dimensions"),
p(t("The 'size' of an embedding is called its dimensionality β the number of floats in the vector. Common sizes are 384, 768, 1024, and 1536. Higher dimensions generally capture more nuance, but cost more to store and search. You do not choose this β it is fixed by the model you pick.", { size: 22 })),
spacer(),
h2("The Critical Rule: Consistency"),
callout([
"You MUST use the same embedding model at indexing time and at query time. If you embed your chunks with model A and then embed the user's question with model B, the vectors live in different spaces and similarity scores become meaningless. This is one of the most common mistakes in production RAG systems."
], "warning"),
spacer(),
h2("Choosing an Embedding Model"),
p(t("There are two categories of embedding models:", { size: 22 })),
spacer(80),
h3("API-Based Models (Hosted)"),
p(t("You send text to an API endpoint and receive the vector back. You pay per token and do not need to manage any infrastructure.", { size: 22 })),
spacer(80),
// Hosted model comparison table; rows generated from 4-tuples.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2800, 1400, 1400, 3760],
rows: [
new TableRow({ children: [hCell("Model", C.navy, C.white, 2800), hCell("Dimensions", C.navy, C.white, 1400), hCell("Max Tokens", C.navy, C.white, 1400), hCell("Best For", C.navy, C.white, 3760)] }),
...([
["OpenAI text-embedding-3-small", "1536", "8191", "General use. Best price/performance ratio. Good starting point."],
["OpenAI text-embedding-3-large", "3072", "8191", "Higher accuracy when quality matters more than cost."],
["Cohere embed-v3", "1024", "512", "Multilingual documents. Has a native 'search' vs 'classification' mode distinction."],
]).map(([m, d, t2, b], i) => new TableRow({ children: [
cell(m, { width: 2800, bg: i%2===0?C.lightGray:C.white }),
cell(d, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
cell(t2, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
cell(b, { width: 3760, bg: i%2===0?C.lightGray:C.white }),
]}))
]
}),
spacer(160),
h3("Local Models (Self-Hosted)"),
p(t("You download the model weights and run them yourself. Free per-call, but requires compute and maintenance.", { size: 22 })),
spacer(80),
// Local model comparison table; same column layout as the hosted table.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2800, 1400, 1400, 3760],
rows: [
new TableRow({ children: [hCell("Model", C.navy, C.white, 2800), hCell("Dimensions", C.navy, C.white, 1400), hCell("Max Tokens", C.navy, C.white, 1400), hCell("Notes", C.navy, C.white, 3760)] }),
...([
["BAAI/bge-large-en-v1.5", "1024", "512", "Strong open-source general model. Common baseline."],
["sentence-transformers/all-MiniLM-L6-v2", "384", "256", "Very fast, very small. Good for prototyping on CPU."],
["nomic-embed-text-v1.5", "768", "8192", "Long context local model. Rare combination."],
]).map(([m, d, t2, b], i) => new TableRow({ children: [
cell(m, { width: 2800, bg: i%2===0?C.lightGray:C.white }),
cell(d, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
cell(t2, { width: 1400, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
cell(b, { width: 3760, bg: i%2===0?C.lightGray:C.white }),
]}))
]
}),
spacer(160),
h2("Asymmetric vs Symmetric Embedding"),
p(t("This concept is important and often skipped in tutorials. There are two types of embedding tasks:", { size: 22 })),
spacer(80),
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2000, 3680, 3680],
rows: [
new TableRow({ children: [hCell("Type", C.navy, C.white, 2000), hCell("What it means", C.navy, C.white, 3680), hCell("When to use it", C.navy, C.white, 3680)] }),
new TableRow({ children: [
cell("Symmetric", { width: 2000, bold: true }),
cell("Query and document are the same kind of text. You embed both the same way.", { width: 3680 }),
cell("Semantic similarity search. Finding duplicate content. Clustering.", { width: 3680 }),
]}),
new TableRow({ children: [
cell("Asymmetric", { width: 2000, bold: true, bg: C.lightGray }),
cell("Query is a short question. Document is a long passage that answers it. They need different treatment.", { width: 3680, bg: C.lightGray }),
cell("Question answering. RAG. Finding answers to user questions in documents.", { width: 3680, bg: C.lightGray }),
]}),
]
}),
spacer(120),
callout([
"For RAG, you are almost always doing asymmetric search. Some models (like bge) support this by letting you prepend a prefix like 'Represent this sentence for searching relevant passages:' to the query. Cohere embed-v3 handles this through an explicit input_type parameter ('search_query' vs 'search_document'). Always check your model's docs for this."
], "concept"),
spacer(),
h2("What Gets Embedded: text_to_embed vs raw_text"),
p(t("You have already made the right architectural decision here. The thing you embed is NOT the same as the thing you return to the LLM. Specifically:", { size: 22 })),
spacer(80),
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2400, 6960],
rows: [
new TableRow({ children: [hCell("Field", C.navy, C.white, 2400), hCell("Content and Purpose", C.navy, C.white, 6960)] }),
new TableRow({ children: [
cell("text_to_embed", { width: 2400, bold: true, bg: C.lightBlue }),
cell("section_path prefix + chunk content. The section path gives the embedding model context about WHERE in the document this content lives. This is what Anthropic calls Contextual Retrieval.", { width: 6960, bg: C.lightBlue }),
]}),
new TableRow({ children: [
cell("raw_text", { width: 2400, bold: true }),
cell("Just the chunk content. This is what you pass to the LLM when generating the answer. Clean, no prefixes.", { width: 6960 }),
]}),
]
}),
spacer(120),
callout([
"Why not embed raw_text alone? Because a chunk like 'Regular Response Email SLA β 2 hr' has almost no context on its own. The embedding model doesn't know this is about SLAs inside a CSS SOP about Champion Petfoods. The section path tells it that, and produces a much better vector."
], "tip"),
pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 2 — VECTOR DATABASES
// ────────────────────────────────────────────────────────────────────────────
// Part 2: vector databases — record anatomy, similarity metrics, product
// comparison tables (dedicated DBs and traditional DBs with extensions),
// plus collections and payload filtering.
children.push(
sectionBanner("PART 2 β Vector Databases"),
spacer(),
h1("Vector Databases: Storing and Searching Embeddings"),
h2("What Problem Does a Vector DB Solve?"),
p(t("Once you have embedded all your chunks, you have thousands of vectors (arrays of floats). When a user asks a question, you embed the question and need to find the most similar vectors from your stored set. This is called Approximate Nearest Neighbor (ANN) search.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
p(t("A regular database like Postgres can do this with a plugin (pgvector), but dedicated vector databases build their entire architecture around making ANN search fast, scalable, and feature-rich.", { size: 22 })),
spacer(),
h2("The Anatomy of a Vector DB Record"),
p(t("Every record stored in a vector DB has three parts:", { size: 22 })),
spacer(80),
// id / vector / payload breakdown table.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [1800, 2400, 5160],
rows: [
new TableRow({ children: [hCell("Part", C.navy, C.white, 1800), hCell("Type", C.navy, C.white, 2400), hCell("Description", C.navy, C.white, 5160)] }),
new TableRow({ children: [cell("id", { width: 1800, bold: true }), cell("string / UUID", { width: 2400 }), cell("Unique identifier for this record.", { width: 5160 })] }),
new TableRow({ children: [cell("vector", { width: 1800, bold: true, bg: C.lightGray }), cell("float[]", { width: 2400, bg: C.lightGray }), cell("The embedding. This is what gets searched. Never returned to the user.", { width: 5160, bg: C.lightGray })] }),
new TableRow({ children: [cell("payload / metadata", { width: 1800, bold: true }), cell("dict / JSON", { width: 2400 }), cell("Everything else: raw_text, section_path, source_file, parent_elem_index, has_table, etc. This is what you read after a search match.", { width: 5160 })] }),
]
}),
spacer(160),
callout([
"The vector is the key for finding. The payload is the value you actually use. You search by vector, then read the payload of the matches. Think of it like an index in a book β the index helps you find the page, but the page contains the actual content."
], "concept"),
spacer(),
h2("How Similarity Search Works"),
p(t("When you search a vector DB, you are not doing exact matching. You are asking: 'which stored vectors are most similar in direction to my query vector?' This is measured by a similarity metric.", { size: 22 })),
spacer(80),
// Similarity metric comparison: cosine / dot product / Euclidean.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2000, 2800, 4560],
rows: [
new TableRow({ children: [hCell("Metric", C.navy, C.white, 2000), hCell("What it measures", C.navy, C.white, 2800), hCell("When to use", C.navy, C.white, 4560)] }),
new TableRow({ children: [
cell("Cosine Similarity", { width: 2000, bold: true }),
cell("Angle between vectors. Ignores magnitude.", { width: 2800 }),
cell("Best for text. Use this for RAG. Most embedding models are trained with cosine similarity in mind.", { width: 4560 }),
]}),
new TableRow({ children: [
cell("Dot Product", { width: 2000, bold: true, bg: C.lightGray }),
cell("Magnitude Γ cosine. Affected by vector length.", { width: 2800, bg: C.lightGray }),
cell("Use when vectors are normalized (length = 1). Mathematically equivalent to cosine then.", { width: 4560, bg: C.lightGray }),
]}),
new TableRow({ children: [
cell("Euclidean (L2)", { width: 2000, bold: true }),
cell("Straight-line distance between vector tips.", { width: 2800 }),
cell("More common in image similarity. Less common for text RAG.", { width: 4560 }),
]}),
]
}),
spacer(),
h2("Vector DB Options"),
spacer(80),
h3("Fully Dedicated Vector Databases"),
// Product comparison table; rows generated from 5-tuples.
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [1600, 1400, 2360, 3200, 800],
rows: [
new TableRow({ children: [hCell("DB", C.navy, C.white, 1600), hCell("Hosting", C.navy, C.white, 1400), hCell("Key Strength", C.navy, C.white, 2360), hCell("Best For", C.navy, C.white, 3200), hCell("Free?", C.navy, C.white, 800)] }),
...([
["Qdrant", "Local / Cloud", "Rich filtering on payload. Sparse+dense hybrid built-in.", "Production RAG. Local dev. Full control.", "Yes"],
["Pinecone", "Cloud only", "Fully managed. Very easy ops.", "Teams that want zero infra management.", "Paid"],
["Weaviate", "Local / Cloud", "GraphQL interface. Built-in modules.", "Complex data relationships.", "Yes"],
["Chroma", "Local", "Extremely simple API. Great for prototyping.", "Local dev and experimentation.", "Yes"],
["Milvus", "Local / Cloud", "High scale. Billion-vector support.", "Large enterprise deployments.", "Yes"],
]).map(([db, host, str, best, free], i) => new TableRow({ children: [
cell(db, { width: 1600, bold: true, bg: i%2===0?C.lightGray:C.white }),
cell(host, { width: 1400, bg: i%2===0?C.lightGray:C.white }),
cell(str, { width: 2360, bg: i%2===0?C.lightGray:C.white }),
cell(best, { width: 3200, bg: i%2===0?C.lightGray:C.white }),
cell(free, { width: 800, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
]}))
]
}),
spacer(160),
h3("Traditional Databases with Vector Extensions"),
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [2000, 7360],
rows: [
new TableRow({ children: [hCell("Option", C.navy, C.white, 2000), hCell("Notes", C.navy, C.white, 7360)] }),
new TableRow({ children: [cell("Postgres + pgvector", { width: 2000, bold: true }), cell("Good choice if you already use Postgres. Vector search is slower than dedicated DBs at large scale, but fine for most business RAG applications under ~1M chunks.", { width: 7360 })] }),
new TableRow({ children: [cell("Redis + RediSearch", { width: 2000, bold: true, bg: C.lightGray }), cell("Fast in-memory option. Good if you already have Redis and latency is critical.", { width: 7360, bg: C.lightGray })] }),
new TableRow({ children: [cell("Elasticsearch", { width: 2000, bold: true }), cell("Added dense vector support. Better for hybrid search (already has strong BM25). Worth considering if you need full-text + vector in one system.", { width: 7360 })] }),
]
}),
spacer(160),
callout([
"For your use case (SOP documents, moderate scale), Qdrant running locally in Docker is the right choice. It has built-in support for hybrid search (dense + sparse in one index), rich payload filtering, and no cloud dependency. When you are ready to deploy, it also has a managed cloud tier."
], "tip"),
spacer(),
h2("Collections and Indexes"),
p(t("In a vector DB, a 'collection' (Qdrant term) or 'index' (Pinecone term) is a named container for a set of vectors. All vectors in a collection must have the same dimensionality. You typically create one collection per document set or per embedding model.", { size: 22 })),
spacer(),
h2("Payload Filtering"),
p(t("One of the most powerful features of dedicated vector DBs is the ability to filter by payload fields during search β not after. This means you can say 'find the top 5 chunks most similar to my query, but only from source_file = X and has_table = true'. The filter is applied during the ANN search, not on the full result set.", { size: 22 })),
spacer(80),
callout([
"This is why storing rich metadata (source_file, section_path, parent_elem_index, has_table) on every chunk matters. These fields become first-class filter parameters at query time. For example, if the user says 'in the Champion Petfoods SOP' you can filter to just that file."
], "concept"),
pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 3 — RETRIEVAL
// ────────────────────────────────────────────────────────────────────────────
children.push(
sectionBanner("PART 3 β Retrieval"),
spacer(),
h1("Retrieval: Finding the Right Chunks"),
p(t("Retrieval is the step that connects the user's question to your stored knowledge. The goal is simple: given a query, return the most relevant chunks. But there are multiple strategies with meaningfully different strengths.", { size: 22 }), { spacing: { before: 80, after: 160 } }),
h2("Strategy 1: Dense Retrieval (Vector Search)"),
p(t("This is what most people mean when they say 'RAG retrieval'. You embed the query and find the nearest vectors in your collection by cosine similarity.", { size: 22 })),
spacer(80),
callout([
[t("Strength: ", { bold: true, size: 20 }), t("Finds semantically similar content even when the wording is completely different. Query: 'how long do I have to respond to a critical issue' will find the chunk containing 'Urgent/Critical - 1 hr' because the meanings are similar.", { size: 20 })],
[t("Weakness: ", { bold: true, size: 20 }), t("Fails at exact keyword matching. If someone searches for a specific code, product name, or ID (like 'project code 79478-CA'), vector search can miss it because the meaning space doesn't preserve exact strings well.", { size: 20 })],
], "note"),
spacer(),
h2("Strategy 2: Sparse Retrieval (BM25 / Keyword Search)"),
p(t("BM25 is the algorithm that powers traditional keyword search engines. It scores documents based on term frequency β how often query words appear in the chunk, weighted by how rare those words are across all chunks.", { size: 22 })),
spacer(80),
callout([
[t("Strength: ", { bold: true, size: 20 }), t("Exact and near-exact keyword matching. Perfect for product codes, person names, specific IDs, and domain jargon that might not appear often in embedding training data.", { size: 20 })],
[t("Weakness: ", { bold: true, size: 20 }), t("Completely blind to semantics. 'Urgent ticket response time' would score zero against a chunk that says 'Critical issue SLA' even though they mean the same thing.", { size: 20 })],
], "note"),
spacer(),
h2("Strategy 3: Hybrid Retrieval (Dense + Sparse)"),
p(t("The current production standard. Run both dense and sparse retrieval in parallel, then merge the results. The merging step is called Reciprocal Rank Fusion (RRF).", { size: 22 })),
spacer(80),
h3("Reciprocal Rank Fusion (RRF)"),
p(t("RRF merges two ranked lists (one from dense, one from sparse) into a single ranked list without needing to normalize scores across different scoring systems. The formula for each document's RRF score is:", { size: 22 })),
spacer(60),
p([t("score(d) = Ξ£ 1 / (k + rank(d))", { bold: true, size: 24, color: C.navy })], { align: AlignmentType.CENTER }),
spacer(60),
p(t("Where k is a constant (usually 60) and rank(d) is the document's position in each ranked list. A document that ranks #1 in both lists will score highest. A document that ranks #20 in one list but doesn't appear in the other will score lower than one that ranks #5 in both.", { size: 22 })),
spacer(120),
callout([
"RRF is elegant because it is rank-based, not score-based. You do not need to worry that a BM25 score of 12.4 and a cosine similarity of 0.87 are on completely different scales. You only care about position in each list."
], "concept"),
spacer(),
h2("Strategy 4: Small-to-Large (Parent Document) Retrieval"),
p(t("This is the pattern you have already designed for with your parent_elem_index field. The idea:", { size: 22 })),
spacer(80),
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [4680, 4680],
rows: [
new TableRow({ children: [hCell("Small chunks (indexed)", C.navy), hCell("Large chunks (returned to LLM)", C.teal)] }),
new TableRow({ children: [
cell("128β256 tokens. Fine-grained. Match questions precisely because each chunk is about one specific thing.", { bg: C.lightBlue }),
cell("The full section (all siblings under the same parent). 400β800 tokens. Give the LLM full context to answer correctly.", { bg: C.lightTeal }),
]}),
]
}),
spacer(120),
p(t("Why both? Because precision and context are in tension. A small chunk matches questions precisely (high recall), but a small chunk often lacks enough context for the LLM to give a complete answer. By returning the full section, the LLM sees everything around the matched chunk.", { size: 22 })),
spacer(80),
callout([
"In your implementation: vector search returns a chunk with parent_elem_index = 1. You then fetch ALL chunks where parent_elem_index == 1, sort by chunk_index, and join them. This reconstructed section goes into the LLM prompt β not the individual matched chunk."
], "tip"),
spacer(),
h2("Strategy 5: Multi-Query Retrieval"),
p(t("A user's question is a single phrasing of their intent, but the answer might be stored under different phrasing. Multi-query retrieval generates multiple reformulations of the question using an LLM, runs retrieval for each, and merges the results.", { size: 22 })),
spacer(80),
p([t("Example: ", { bold: true, size: 22 }), t("User asks 'who do I contact for IT problems?'. Multi-query generates:", { size: 22 })]),
bullet("'IT support contact information'"),
bullet("'technology helpdesk details'"),
bullet("'helpdesk URL for technical issues'"),
p(t("Each gets retrieved separately, then deduplicated and merged. This significantly improves recall for queries that could be phrased many ways.", { size: 22 })),
spacer(),
h2("top_k: How Many Chunks to Retrieve?"),
p(t("Every retrieval call asks for the top-k most similar chunks. This is a key hyperparameter:", { size: 22 })),
spacer(80),
new Table({
width: { size: 9360, type: WidthType.DXA },
columnWidths: [1600, 2800, 4960],
rows: [
new TableRow({ children: [hCell("top_k value", C.navy, C.white, 1600), hCell("Trade-off", C.navy, C.white, 2800), hCell("Risk", C.navy, C.white, 4960)] }),
new TableRow({ children: [cell("3β5", { width: 1600, bold: true }), cell("Fast, cheap, focused", { width: 2800 }), cell("May miss relevant chunks if the answer is spread across multiple sections.", { width: 4960 })] }),
new TableRow({ children: [cell("10β20", { width: 1600, bold: true, bg: C.lightGray }), cell("More coverage", { width: 2800, bg: C.lightGray }), cell("LLM context window fills up faster. More noise if retrieval quality is low.", { width: 4960, bg: C.lightGray })] }),
new TableRow({ children: [cell("50+", { width: 1600, bold: true }), cell("High recall", { width: 2800 }), cell("Context rot: LLMs perform worse when the context is very long and the answer is buried.", { width: 4960 })] }),
]
}),
spacer(120),
callout([
"Practical starting point: retrieve top 10 from vector search, rerank them, pass top 3β5 to the LLM. The reranker is what lets you safely retrieve more candidates without sending all of them to the LLM."
], "tip"),
pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 4 — RERANKING
// ────────────────────────────────────────────────────────────────────────────
// Appends the reranking chapter: why first-stage ranking is imprecise,
// bi-encoder vs cross-encoder, model options, and when to skip reranking.
// NOTE(review): mojibake repaired — "β" restored to — and the pipeline arrows →.
children.push(
  sectionBanner("PART 4 — Reranking"),
  spacer(),
  h1("Reranking: Sorting Candidates by True Relevance"),
  h2("Why Retrieval Alone Is Not Enough"),
  p(t("Vector search is fast but approximate. It finds chunks that are in the same semantic neighborhood as your query, but the ranking within those results is imprecise. The top result is not always the most relevant — sometimes the 4th or 5th result is actually a better answer.", { size: 22 }), { spacing: { before: 80, after: 80 } }),
  p(t("Reranking solves this by applying a more expensive but more accurate model to re-score the top-N candidates from retrieval.", { size: 22 })),
  spacer(),
  h2("How a Reranker Works"),
  p(t("A reranker is a cross-encoder model. Unlike a bi-encoder embedding model (which encodes query and document independently), a cross-encoder takes the query and a document together as a single input and outputs a relevance score.", { size: 22 })),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [3000, 3180, 3180],
    rows: [
      new TableRow({ children: [hCell("", C.navy, C.white, 3000), hCell("Bi-encoder (Embedding model)", C.navy, C.white, 3180), hCell("Cross-encoder (Reranker)", C.navy, C.white, 3180)] }),
      new TableRow({ children: [cell("How it works", { width: 3000, bold: true }), cell("Encodes query and document independently. Compares vectors.", { width: 3180 }), cell("Reads query AND document together. Produces a single relevance score.", { width: 3180 })] }),
      new TableRow({ children: [cell("Speed", { width: 3000, bold: true, bg: C.lightGray }), cell("Fast. Can search millions of vectors in milliseconds.", { width: 3180, bg: C.lightGray }), cell("Slow. Must process query + each candidate individually.", { width: 3180, bg: C.lightGray })] }),
      new TableRow({ children: [cell("Accuracy", { width: 3000, bold: true }), cell("Good. Misses nuance because query and doc context are separate.", { width: 3180 }), cell("Better. Sees the interaction between query and document explicitly.", { width: 3180 })] }),
      new TableRow({ children: [cell("Use case", { width: 3000, bold: true, bg: C.lightGray }), cell("First-stage retrieval. Fast candidate selection.", { width: 3180, bg: C.lightGray }), cell("Second-stage rescoring. Applied only to top-N candidates.", { width: 3180, bg: C.lightGray })] }),
    ]
  }),
  spacer(160),
  callout([
    "The standard pipeline: retrieve top 20 candidates with fast vector search → rerank to top 5 with a cross-encoder → pass top 5 to the LLM. You get the recall of retrieving 20 and the precision of sending only the best 5."
  ], "tip"),
  spacer(),
  h2("Reranker Options"),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2400, 1600, 5360],
    rows: [
      new TableRow({ children: [hCell("Model", C.navy, C.white, 2400), hCell("Type", C.navy, C.white, 1600), hCell("Notes", C.navy, C.white, 5360)] }),
      // Zebra-striped data rows: even indexes get the light-gray background.
      ...([
        ["Cohere Rerank 3", "API", "Best quality. Supports 100+ languages. Accepts up to 10,000 token documents. Pay per call."],
        ["BGE-Reranker-Large", "Local", "Strong open-source option. Run on your own GPU/CPU. BAAI family."],
        ["cross-encoder/ms-marco-MiniLM", "Local", "Small and fast. Good for CPU-only environments. Slightly lower quality."],
        ["Jina Reranker v2", "API / Local", "Good multilingual support. Can run locally or via API."],
      ]).map(([m, type, notes], i) => new TableRow({ children: [
        cell(m, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
        cell(type, { width: 1600, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
        cell(notes, { width: 5360, bg: i%2===0?C.lightGray:C.white }),
      ]}))
    ]
  }),
  spacer(),
  h2("Is Reranking Always Necessary?"),
  p(t("No. It is a quality-vs-latency trade-off. Add it when:", { size: 22 })),
  bullet("Your retrieval is returning the right sections but not always in the right order."),
  bullet("Your queries are complex or multi-part."),
  bullet("Accuracy matters more than response time."),
  spacer(80),
  p(t("Skip it initially. Build without reranking first, measure quality, then add it when you identify it as the bottleneck.", { size: 22 })),
  pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 5 — GENERATION
// ────────────────────────────────────────────────────────────────────────────
// Appends the generation chapter: prompt assembly, context rot ("lost in the
// middle"), grounding/hallucination guidance, and an LLM comparison table.
// NOTE(review): mojibake repaired — "β" in the banner heading restored to "—".
children.push(
  sectionBanner("PART 5 — Generation"),
  spacer(),
  h1("Generation: From Retrieved Chunks to Final Answer"),
  h2("The Prompt Structure"),
  p(t("The generation step takes the retrieved (and optionally reranked) chunks, assembles them into a prompt, and calls an LLM. The prompt structure is the main lever you control here.", { size: 22 })),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2000, 7360],
    rows: [
      new TableRow({ children: [hCell("Prompt Section", C.navy, C.white, 2000), hCell("Content", C.navy, C.white, 7360)] }),
      new TableRow({ children: [cell("System prompt", { width: 2000, bold: true }), cell("Instructions for the LLM. Role, constraints, behavior. Example: 'You are a helpful assistant. Answer only using the provided context. If the answer is not in the context, say so explicitly.'", { width: 7360 })] }),
      new TableRow({ children: [cell("Context block", { width: 2000, bold: true, bg: C.lightGray }), cell("The retrieved chunks, formatted with their section paths as headers. Each chunk is clearly labeled with its source section so the LLM can cite it.", { width: 7360, bg: C.lightGray })] }),
      new TableRow({ children: [cell("User question", { width: 2000, bold: true }), cell("The original user query, unchanged.", { width: 7360 })] }),
    ]
  }),
  spacer(),
  h2("Context Rot: A Real Problem"),
  p(t("Research has consistently shown that LLMs perform worse when the relevant information is buried in the middle of a long context. Performance is highest when the relevant chunk is at the beginning or end of the context block. This is called 'lost in the middle'.", { size: 22 })),
  spacer(80),
  callout([
    "Practical implication: after reranking, order your chunks so the highest-scoring ones appear first in the context block, not in the middle. Also: do not send more context than necessary. 3 high-quality chunks often outperform 10 mediocre ones."
  ], "warning"),
  spacer(),
  h2("Grounding and Hallucination Prevention"),
  p(t("The system prompt is your primary tool for keeping the LLM grounded in your documents. Key instructions to include:", { size: 22 })),
  bullet("'Answer only based on the provided context documents.'"),
  bullet("'If the context does not contain enough information to answer, say: I could not find this in the provided documents.'"),
  bullet("'Do not make up information. Do not use your general knowledge.'"),
  bullet("'Cite the section you are drawing from when possible.'"),
  spacer(80),
  callout([
    "No prompt instruction eliminates hallucination entirely, but clear grounding instructions reduce it significantly. The more specific you are about what the LLM should do when it doesn't know, the more reliable the behavior."
  ], "note"),
  spacer(),
  h2("LLM Options for Generation"),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2400, 1600, 1360, 4000],
    rows: [
      new TableRow({ children: [hCell("Model", C.navy, C.white, 2400), hCell("Context Window", C.navy, C.white, 1600), hCell("Cost", C.navy, C.white, 1360), hCell("Notes", C.navy, C.white, 4000)] }),
      // Zebra-striped data rows: even indexes get the light-gray background.
      ...([
        ["Claude Sonnet 4", "200K tokens", "Medium", "Excellent instruction following. Great for RAG. Strong at staying grounded in context."],
        ["GPT-4o", "128K tokens", "Medium", "Strong all-around. Good at following complex system prompts."],
        ["GPT-4o mini", "128K tokens", "Low", "Good quality at lower cost. Reasonable grounding."],
        ["Llama 3.1 70B", "128K tokens", "Self-host", "Open source. Good quality. Requires GPU infrastructure."],
      ]).map(([m, ctx, cost, notes], i) => new TableRow({ children: [
        cell(m, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
        cell(ctx, { width: 1600, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
        cell(cost, { width: 1360, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
        cell(notes, { width: 4000, bg: i%2===0?C.lightGray:C.white }),
      ]}))
    ]
  }),
  pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 6 — EVALUATION
// ────────────────────────────────────────────────────────────────────────────
// Appends the evaluation chapter: the three failure modes, RAGAS metrics, and
// a minimum viable manual evaluation loop.
// NOTE(review): mojibake repaired — "β" restored to — and the range dash –.
children.push(
  sectionBanner("PART 6 — Evaluation"),
  spacer(),
  h1("Evaluation: Knowing If Your RAG System Is Working"),
  h2("Why Evaluation Is Non-Negotiable"),
  p(t("Without evaluation, you are flying blind. You might swap embedding models, change chunk sizes, or add reranking — and have no way of knowing if things got better or worse. A simple evaluation harness built early saves enormous time later.", { size: 22 })),
  spacer(),
  h2("The Three Things That Can Fail"),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2400, 3480, 3480],
    rows: [
      new TableRow({ children: [hCell("Failure Mode", C.navy, C.white, 2400), hCell("Symptom", C.navy, C.white, 3480), hCell("Root Cause", C.navy, C.white, 3480)] }),
      new TableRow({ children: [cell("Retrieval failure", { width: 2400, bold: true }), cell("The right chunk never made it into the context at all.", { width: 3480 }), cell("Chunk boundaries wrong. Embedding model poor fit. top_k too low.", { width: 3480 })] }),
      new TableRow({ children: [cell("Context failure", { width: 2400, bold: true, bg: C.lightGray }), cell("The right chunk was retrieved but the LLM couldn't use it properly.", { width: 3480, bg: C.lightGray }), cell("Chunk too small, context rot, section path missing.", { width: 3480, bg: C.lightGray })] }),
      new TableRow({ children: [cell("Generation failure", { width: 2400, bold: true }), cell("Context was correct but the answer was wrong or hallucinated.", { width: 3480 }), cell("Weak system prompt, wrong LLM, temperature too high.", { width: 3480 })] }),
    ]
  }),
  spacer(160),
  callout([
    "When your RAG agent gives a wrong answer, always ask: was the right chunk retrieved? Check the context that was actually passed to the LLM before blaming generation. Most of the time, retrieval is the culprit."
  ], "warning"),
  spacer(),
  h2("RAGAS: The Standard Evaluation Framework"),
  p(t("RAGAS (RAG Assessment) is an open-source framework that measures RAG quality without requiring hand-labeled ground truth. It uses an LLM to evaluate the quality of retrieved context and generated answers.", { size: 22 })),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2400, 6960],
    rows: [
      new TableRow({ children: [hCell("Metric", C.navy, C.white, 2400), hCell("What it measures", C.navy, C.white, 6960)] }),
      new TableRow({ children: [cell("Context Precision", { width: 2400, bold: true }), cell("Of the chunks retrieved, how many were actually relevant? High precision = low noise in context.", { width: 6960 })] }),
      new TableRow({ children: [cell("Context Recall", { width: 2400, bold: true, bg: C.lightGray }), cell("Of all the relevant chunks in the knowledge base, how many were retrieved? High recall = not missing important chunks.", { width: 6960, bg: C.lightGray })] }),
      new TableRow({ children: [cell("Faithfulness", { width: 2400, bold: true }), cell("Is the generated answer factually consistent with the retrieved context? Measures hallucination.", { width: 6960 })] }),
      new TableRow({ children: [cell("Answer Relevancy", { width: 2400, bold: true, bg: C.lightGray }), cell("Does the generated answer actually address the question asked? Measures completeness.", { width: 6960, bg: C.lightGray })] }),
    ]
  }),
  spacer(160),
  h2("Minimum Viable Evaluation (Before Using RAGAS)"),
  p(t("Before setting up a full evaluation framework, do this manually with 10–15 questions from your actual documents:", { size: 22 })),
  bullet("Write 10 questions whose answers exist clearly in your SOP documents."),
  bullet("For each question, inspect the retrieved chunks — did the right section appear?"),
  bullet("For each question, read the generated answer — is it correct? Does it hallucinate?"),
  bullet("Classify each failure as retrieval failure, context failure, or generation failure."),
  bullet("The failure distribution tells you exactly what to fix first."),
  spacer(80),
  callout([
    "This manual inspection of 10 questions will teach you more about your system than any automated metric. Do it before writing more code."
  ], "tip"),
  pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 7 — DECISION GUIDE
// ────────────────────────────────────────────────────────────────────────────
// Appends the decision-guide chapter: concrete embedding-model / vector-DB /
// retrieval / LLM recommendations plus a phased build order.
// NOTE(review): mojibake repaired — "β" restored to — and the arrow →.
children.push(
  sectionBanner("PART 7 — Your Decision Guide"),
  spacer(),
  h1("Making Decisions for Your Specific System"),
  h2("What You Have Already Built"),
  callout([
    "Parser: DOCX → structured blocks with type, heading_level, elem_index, page_index.",
    "Parent chain: every block has parent_id pointing to its heading ancestor.",
    "Section-aware chunks: blocks grouped by parent, section_path prepended to text_to_embed.",
    "Table handling: oversized tables get LLM summaries; summary is embedded, raw table is stored.",
    "Two-field design: text_to_embed (goes to embedding model) vs raw_text (goes to LLM).",
  ], "tip"),
  spacer(),
  h2("Decision 1: Which Embedding Model?"),
  p(t("Recommendation: Start with OpenAI text-embedding-3-small.", { size: 22, bold: true })),
  bullet("It has an 8191 token max — your chunks (under 500 tokens + section path prefix) are well within limits."),
  bullet("The 1536-dimensional vectors offer strong quality for English-language business documents."),
  bullet("It is the most common choice, meaning most tutorials, examples, and integrations assume it."),
  bullet("If you later need better quality: upgrade to text-embedding-3-large (same API, just a model name change)."),
  bullet("If you later need no API dependency: switch to BAAI/bge-large-en-v1.5 locally."),
  spacer(80),
  callout(["Do not over-engineer this choice. The embedding model is the easiest thing to swap later. Chunk quality matters far more, and you have already invested there."], "note"),
  spacer(),
  h2("Decision 2: Which Vector Database?"),
  p(t("Recommendation: Qdrant running locally via Docker.", { size: 22, bold: true })),
  bullet("Free, no account required, runs on your machine."),
  bullet("Supports hybrid search (dense + sparse BM25) natively — you will want this."),
  bullet("Rich payload filtering — you can filter by source_file, has_table, parent_elem_index."),
  bullet("Same API works when you eventually move to Qdrant Cloud."),
  spacer(),
  h2("Decision 3: Which Retrieval Strategy?"),
  p(t("Recommendation: Start with dense retrieval only, then add BM25 hybrid once baseline works.", { size: 22, bold: true })),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2000, 7360],
    rows: [
      new TableRow({ children: [hCell("Phase", C.navy, C.white, 2000), hCell("What to implement", C.navy, C.white, 7360)] }),
      new TableRow({ children: [cell("Phase 1", { width: 2000, bold: true }), cell("Dense vector search only. top_k = 10. Return parent sections (small-to-large). No reranker.", { width: 7360 })] }),
      new TableRow({ children: [cell("Phase 2", { width: 2000, bold: true, bg: C.lightGray }), cell("Add BM25 sparse search. Merge with RRF. Especially valuable for queries with specific names, codes, email addresses.", { width: 7360, bg: C.lightGray })] }),
      new TableRow({ children: [cell("Phase 3", { width: 2000, bold: true }), cell("Add Cohere Rerank. Retrieve top 20, rerank to top 5, pass top 5 to LLM.", { width: 7360 })] }),
    ]
  }),
  spacer(),
  h2("Decision 4: Which LLM for Generation?"),
  p(t("Recommendation: Claude Sonnet 4 or GPT-4o. Both follow grounding instructions well.", { size: 22, bold: true })),
  bullet("Set temperature to 0 for factual Q&A — you want deterministic, grounded answers."),
  bullet("Write a strict system prompt that forbids answering outside the provided context."),
  bullet("Include section paths as headers in your context block so the LLM can cite sources."),
  spacer(),
  h2("The Build Order"),
  p(t("Given where you are right now, here is the recommended sequence:", { size: 22 })),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [800, 2800, 5760],
    rows: [
      new TableRow({ children: [hCell("#", C.navy, C.white, 800), hCell("Task", C.navy, C.white, 2800), hCell("Why this order", C.navy, C.white, 5760)] }),
      // Zebra-striped data rows: even indexes get the light-gray background.
      ...([
        ["1", "Embed chunks + store in Qdrant", "Nothing else works until you have searchable vectors."],
        ["2", "Build basic dense retrieval + parent expansion", "This alone can answer most questions. Establish your baseline."],
        ["3", "Evaluate manually with 10 real questions", "Find out where you are failing before adding complexity."],
        ["4", "Add BM25 hybrid + RRF", "Fixes failures on specific names, codes, and exact terms."],
        ["5", "Add reranker", "Fixes ordering failures — right chunks retrieved but ranked wrong."],
        ["6", "Add multi-query expansion", "Fixes cases where phrasing mismatch causes miss."],
        ["7", "Set up RAGAS evaluation", "Automate quality tracking so regressions are caught."],
      ]).map(([n, task, why], i) => new TableRow({ children: [
        cell(n, { width: 800, bold: true, align: AlignmentType.CENTER, bg: i%2===0?C.lightGray:C.white }),
        cell(task, { width: 2800, bold: true, bg: i%2===0?C.lightGray:C.white }),
        cell(why, { width: 5760, bg: i%2===0?C.lightGray:C.white }),
      ]}))
    ]
  }),
  spacer(160),
  callout([
    "The biggest mistake engineers make in RAG: adding complexity (reranking, multi-query, graph RAG) before establishing and measuring a baseline. Build the simplest thing that could work. Measure it. Then add exactly the complexity that fixes the specific failure mode you observe."
  ], "warning"),
  pageBreak()
);
// ────────────────────────────────────────────────────────────────────────────
// PART 8 — GLOSSARY
// ────────────────────────────────────────────────────────────────────────────
// Appends the glossary: a single two-column, zebra-striped table of key RAG
// terms built from [term, definition] tuples.
// NOTE(review): mojibake repaired — "β" in the banner heading restored to "—".
children.push(
  sectionBanner("PART 8 — Glossary"),
  spacer(),
  h1("Glossary of Key Terms"),
  spacer(80),
  new Table({
    width: { size: 9360, type: WidthType.DXA },
    columnWidths: [2400, 6960],
    rows: [
      new TableRow({ children: [hCell("Term", C.navy, C.white, 2400), hCell("Definition", C.navy, C.white, 6960)] }),
      // Zebra-striped data rows: even indexes get the light-gray background.
      ...([
        ["ANN (Approximate Nearest Neighbor)", "An algorithm for finding vectors that are close to a query vector, without checking every single stored vector. Trades a small amount of accuracy for large speed gains."],
        ["BM25", "Best Match 25. A keyword-based ranking algorithm. Scores documents by term frequency weighted by inverse document frequency (IDF). The basis of traditional search engines."],
        ["Bi-encoder", "An embedding model architecture where query and document are encoded independently, then compared by vector similarity. Fast. Used for first-stage retrieval."],
        ["Chunking", "The process of splitting documents into smaller text segments for embedding. The quality of chunking is the single largest determinant of RAG quality."],
        ["Contextual Retrieval", "Anthropic's technique of prepending a chunk's document context (section path, summary) to the chunk text before embedding, so the vector captures where the chunk lives in the document."],
        ["Context Rot", "The phenomenon where LLM performance degrades when relevant information is buried in the middle of a long context. Also called 'lost in the middle'."],
        ["Cosine Similarity", "A measure of the angle between two vectors. Range: -1 to 1. Used as the similarity metric in most RAG vector searches. 1 = identical direction, 0 = orthogonal, -1 = opposite."],
        ["Cross-encoder", "A reranker model architecture that takes query and document as a single input and outputs a relevance score. More accurate than bi-encoders but too slow for first-stage retrieval."],
        ["Dense Retrieval", "Retrieval based on vector similarity (embeddings). Finds semantically similar content even when wording differs."],
        ["Embedding", "A vector (list of floats) that represents the meaning of a text. Produced by an embedding model. Similar texts produce similar vectors."],
        ["Faithfulness", "A RAGAS metric measuring whether a generated answer is factually grounded in the retrieved context (i.e., not hallucinated)."],
        ["Hallucination", "When an LLM generates plausible-sounding but factually incorrect information not supported by its context."],
        ["Hybrid Retrieval", "Combining dense (vector) and sparse (BM25) retrieval results into a single ranked list, usually via Reciprocal Rank Fusion."],
        ["Indexing", "The offline phase of RAG: parsing, chunking, embedding, and storing documents in a vector database."],
        ["Payload", "The metadata stored alongside a vector in a vector DB. Contains raw_text, section_path, source_file, etc. Retrieved after a search match."],
        ["RAGAS", "RAG Assessment. An open-source framework for evaluating RAG systems using LLM-based metrics (context precision, recall, faithfulness, answer relevancy)."],
        ["Reciprocal Rank Fusion (RRF)", "An algorithm for merging ranked lists from different retrieval systems without needing to normalize scores. Combines dense and sparse results by rank position."],
        ["Reranking", "A second-stage scoring step that re-orders the top-N retrieved candidates using a cross-encoder model for higher accuracy before passing to the LLM."],
        ["Section Path", "The full heading ancestry of a chunk expressed as a breadcrumb (e.g., 'SOP Title > Section Name'). Prepended to chunks before embedding for contextual retrieval."],
        ["Small-to-Large Retrieval", "A pattern where small chunks are indexed for precise retrieval, but the full parent section is fetched and sent to the LLM for generation."],
        ["Sparse Retrieval", "Retrieval based on keyword matching (BM25). Excels at exact term matching but blind to semantic similarity."],
        ["top_k", "The number of candidates to retrieve from the vector database. A hyperparameter that controls the recall-precision trade-off."],
        ["Vector Database", "A database optimized for storing and searching high-dimensional vectors. Supports approximate nearest neighbor search and payload filtering."],
      ]).map(([term, def], i) => new TableRow({ children: [
        cell(term, { width: 2400, bold: true, bg: i%2===0?C.lightGray:C.white }),
        cell(def, { width: 6960, bg: i%2===0?C.lightGray:C.white }),
      ]}))
    ]
  })
);
// ────────────────────────────────────────────────────────────────────────────
// ASSEMBLE DOCUMENT
// ────────────────────────────────────────────────────────────────────────────
// Builds the final Document from the accumulated `children`: bullet numbering
// config, heading styles (H1–H3 in the palette colors), and a single US-Letter
// section (12240 x 15840 DXA) with 1-inch (1440 DXA) margins.
const doc = new Document({
  numbering: {
    config: [
      {
        reference: "bullets",
        levels: [
          // Bullet glyphs restored: "β’" / "β¦" were mis-decoded UTF-8 for the
          // standard bullet (•) and white bullet (◦) characters.
          { level: 0, format: LevelFormat.BULLET, text: "•", alignment: AlignmentType.LEFT,
            style: { paragraph: { indent: { left: 720, hanging: 360 } }, run: { font: "Arial", size: 22 } } },
          { level: 1, format: LevelFormat.BULLET, text: "◦", alignment: AlignmentType.LEFT,
            style: { paragraph: { indent: { left: 1080, hanging: 360 } }, run: { font: "Arial", size: 22 } } },
        ]
      }
    ]
  },
  styles: {
    // Base run: Arial 11pt (docx half-point sizes: 22 = 11pt).
    default: { document: { run: { font: "Arial", size: 22, color: C.black } } },
    paragraphStyles: [
      { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true,
        run: { size: 36, bold: true, color: C.navy, font: "Arial" },
        paragraph: { spacing: { before: 360, after: 160 }, outlineLevel: 0 } },
      { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true,
        run: { size: 28, bold: true, color: C.blue, font: "Arial" },
        paragraph: { spacing: { before: 280, after: 120 }, outlineLevel: 1 } },
      { id: "Heading3", name: "Heading 3", basedOn: "Normal", next: "Normal", quickFormat: true,
        run: { size: 24, bold: true, color: C.teal, font: "Arial" },
        paragraph: { spacing: { before: 200, after: 80 }, outlineLevel: 2 } },
    ]
  },
  sections: [{
    properties: {
      page: {
        size: { width: 12240, height: 15840 },
        margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 }
      }
    },
    children
  }]
});
// Render the assembled document to a .docx buffer and write it to disk.
// A .catch is attached so a render or write failure is reported and the
// process exits non-zero, instead of dying with an unhandled promise
// rejection (the original chain had no rejection handler).
Packer.toBuffer(doc)
  .then((buf) => {
    fs.writeFileSync("/mnt/user-data/outputs/RAG_System_Design_Study_Guide.docx", buf);
    console.log("Done");
  })
  .catch((err) => {
    console.error("Failed to generate DOCX:", err);
    process.exitCode = 1;
  });