Spaces:
Running
Running
| [ | |
| { | |
| "start": 0.04, | |
| "text": "hi everyone so in this video I'd like us" | |
| }, | |
| { | |
| "start": 2.04, | |
| "text": "to cover the process of tokenization in" | |
| }, | |
| { | |
| "start": 4.08, | |
| "text": "large language models now you see here" | |
| }, | |
| { | |
| "start": 6.44, | |
| "text": "that I have a set face and that's" | |
| }, | |
| { | |
| "start": 8.28, | |
| "text": "because uh tokenization is my least" | |
| }, | |
| { | |
| "start": 10.32, | |
| "text": "favorite part of working with large" | |
| }, | |
| { | |
| "start": 11.679, | |
| "text": "language models but unfortunately it is" | |
| }, | |
| { | |
| "start": 13.48, | |
| "text": "necessary to understand in some detail" | |
| }, | |
| { | |
| "start": 15.519, | |
| "text": "because it it is fairly hairy gnarly and" | |
| }, | |
| { | |
| "start": 17.6, | |
| "text": "there's a lot of hidden foot guns to be" | |
| }, | |
| { | |
| "start": 19.48, | |
| "text": "aware of and a lot of oddness with large" | |
| }, | |
| { | |
| "start": 21.84, | |
| "text": "language models typically traces back to" | |
| }, | |
| { | |
| "start": 24.599, | |
| "text": "tokenization so what is" | |
| }, | |
| { | |
| "start": 26.64, | |
| "text": "tokenization now in my previous video" | |
| }, | |
| { | |
| "start": 28.92, | |
| "text": "Let's Build GPT from scratch uh we" | |
| }, | |
| { | |
| "start": 31.56, | |
| "text": "actually already did tokenization but we" | |
| }, | |
| { | |
| "start": 33.48, | |
| "text": "did a very naive simple version of" | |
| }, | |
| { | |
| "start": 35.8, | |
| "text": "tokenization so when you go to the" | |
| }, | |
| { | |
| "start": 37.48, | |
| "text": "Google colab for that video uh you see" | |
| }, | |
| { | |
| "start": 40.559, | |
| "text": "here that we loaded our training set and" | |
| }, | |
| { | |
| "start": 43.2, | |
| "text": "our training set was this uh Shakespeare" | |
| }, | |
| { | |
| "start": 45.52, | |
| "text": "uh data set now in the beginning the" | |
| }, | |
| { | |
| "start": 48.12, | |
| "text": "Shakespeare data set is just a large" | |
| }, | |
| { | |
| "start": 49.76, | |
| "text": "string in Python it's just text and so" | |
| }, | |
| { | |
| "start": 52.44, | |
| "text": "the question is how do we plug text into" | |
| }, | |
| { | |
| "start": 54.84, | |
| "text": "large language models and in this case" | |
| }, | |
| { | |
| "start": 58.079, | |
| "text": "here we created a vocabulary of 65" | |
| }, | |
| { | |
| "start": 61.44, | |
| "text": "possible characters that we saw occur in" | |
| }, | |
| { | |
| "start": 63.96, | |
| "text": "this string these were the possible" | |
| }, | |
| { | |
| "start": 65.799, | |
| "text": "characters and we saw that there are 65" | |
| }, | |
| { | |
| "start": 67.96, | |
| "text": "of them and then we created a a lookup" | |
| }, | |
| { | |
| "start": 70.64, | |
| "text": "table for converting from every possible" | |
| }, | |
| { | |
| "start": 73.4, | |
| "text": "character a little string piece into a" | |
| }, | |
| { | |
| "start": 76.32, | |
| "text": "token an" | |
| }, | |
| { | |
| "start": 77.759, | |
| "text": "integer so here for example we tokenized" | |
| }, | |
| { | |
| "start": 80.52, | |
| "text": "the string High there and we received" | |
| }, | |
| { | |
| "start": 83.28, | |
| "text": "this sequence of" | |
| }, | |
| { | |
| "start": 84.72, | |
| "text": "tokens and here we took the first 1,000" | |
| }, | |
| { | |
| "start": 87.6, | |
| "text": "characters of our data set and we" | |
| }, | |
| { | |
| "start": 89.92, | |
| "text": "encoded it into tokens and because it is" | |
| }, | |
| { | |
| "start": 92.56, | |
| "text": "this is character level we received" | |
| }, | |
| { | |
| "start": 94.64, | |
| "text": "1,000 tokens in a sequence so token 18" | |
| }, | |
| { | |
| "start": 98.96, | |
| "text": "47" | |
| }, | |
| { | |
| "start": 100.119, | |
| "text": "Etc now later we saw that the way we" | |
| }, | |
| { | |
| "start": 103.439, | |
| "text": "plug these tokens into the language" | |
| }, | |
| { | |
| "start": 105.64, | |
| "text": "model is by using an embedding" | |
| }, | |
| { | |
| "start": 108.479, | |
| "text": "table and so basically if we have 65" | |
| }, | |
| { | |
| "start": 111.479, | |
| "text": "possible tokens then this embedding" | |
| }, | |
| { | |
| "start": 113.479, | |
| "text": "table is going to have 65 rows and" | |
| }, | |
| { | |
| "start": 116.439, | |
| "text": "roughly speaking we're taking the" | |
| }, | |
| { | |
| "start": 118.159, | |
| "text": "integer associated with every single" | |
| }, | |
| { | |
| "start": 119.799, | |
| "text": "sing Le token we're using that as a" | |
| }, | |
| { | |
| "start": 121.52, | |
| "text": "lookup into this table and we're" | |
| }, | |
| { | |
| "start": 124.039, | |
| "text": "plucking out the corresponding row and" | |
| }, | |
| { | |
| "start": 126.479, | |
| "text": "this row is a uh is trainable parameters" | |
| }, | |
| { | |
| "start": 129.36, | |
| "text": "that we're going to train using back" | |
| }, | |
| { | |
| "start": 130.479, | |
| "text": "propagation and this is the vector that" | |
| }, | |
| { | |
| "start": 132.879, | |
| "text": "then feeds into the Transformer um and" | |
| }, | |
| { | |
| "start": 135.36, | |
| "text": "that's how the Transformer Ser of" | |
| }, | |
| { | |
| "start": 136.56, | |
| "text": "perceives every single" | |
| }, | |
| { | |
| "start": 138.12, | |
| "text": "token so here we had a very naive" | |
| }, | |
| { | |
| "start": 141.28, | |
| "text": "tokenization process that was a" | |
| }, | |
| { | |
| "start": 143.12, | |
| "text": "character level tokenizer but in" | |
| }, | |
| { | |
| "start": 145.239, | |
| "text": "practice in state-ofthe-art uh language" | |
| }, | |
| { | |
| "start": 147.28, | |
| "text": "models people use a lot more complicated" | |
| }, | |
| { | |
| "start": 148.959, | |
| "text": "schemes unfortunately" | |
| }, | |
| { | |
| "start": 150.44, | |
| "text": "uh for constructing these uh token" | |
| }, | |
| { | |
| "start": 154.36, | |
| "text": "vocabularies so we're not dealing on the" | |
| }, | |
| { | |
| "start": 156.64, | |
| "text": "Character level we're dealing on chunk" | |
| }, | |
| { | |
| "start": 158.64, | |
| "text": "level and the way these um character" | |
| }, | |
| { | |
| "start": 161.519, | |
| "text": "chunks are constructed is using" | |
| }, | |
| { | |
| "start": 163.879, | |
| "text": "algorithms such as for example the bik" | |
| }, | |
| { | |
| "start": 165.48, | |
| "text": "pair in coding algorithm which we're" | |
| }, | |
| { | |
| "start": 166.959, | |
| "text": "going to go into in detail um and cover" | |
| }, | |
| { | |
| "start": 171.0, | |
| "text": "in this video I'd like to briefly show" | |
| }, | |
| { | |
| "start": 172.879, | |
| "text": "you the paper that introduced a bite" | |
| }, | |
| { | |
| "start": 174.84, | |
| "text": "level encoding as a mechanism for" | |
| }, | |
| { | |
| "start": 176.92, | |
| "text": "tokenization in the context of large" | |
| }, | |
| { | |
| "start": 178.44, | |
| "text": "language models and I would say that" | |
| }, | |
| { | |
| "start": 180.599, | |
| "text": "that's probably the gpt2 paper and if" | |
| }, | |
| { | |
| "start": 182.72, | |
| "text": "you scroll down here to the section" | |
| }, | |
| { | |
| "start": 185.56, | |
| "text": "input representation this is where they" | |
| }, | |
| { | |
| "start": 187.72, | |
| "text": "cover tokenization the kinds of" | |
| }, | |
| { | |
| "start": 189.48, | |
| "text": "properties that you'd like the" | |
| }, | |
| { | |
| "start": 190.56, | |
| "text": "tokenization to have and they conclude" | |
| }, | |
| { | |
| "start": 193.0, | |
| "text": "here that they're going to have a" | |
| }, | |
| { | |
| "start": 194.959, | |
| "text": "tokenizer where you have a vocabulary of" | |
| }, | |
| { | |
| "start": 197.599, | |
| "text": "50,2 57 possible" | |
| }, | |
| { | |
| "start": 200.68, | |
| "text": "tokens and the context size is going to" | |
| }, | |
| { | |
| "start": 204.4, | |
| "text": "be 1,24 tokens so in the in in the" | |
| }, | |
| { | |
| "start": 207.36, | |
| "text": "attention layer of the Transformer" | |
| }, | |
| { | |
| "start": 209.239, | |
| "text": "neural network" | |
| }, | |
| { | |
| "start": 210.48, | |
| "text": "every single token is attending to the" | |
| }, | |
| { | |
| "start": 212.319, | |
| "text": "previous tokens in the sequence and it's" | |
| }, | |
| { | |
| "start": 214.08, | |
| "text": "going to see up to 1,24 tokens so tokens" | |
| }, | |
| { | |
| "start": 217.92, | |
| "text": "are this like fundamental unit um the" | |
| }, | |
| { | |
| "start": 220.68, | |
| "text": "atom of uh large language models if you" | |
| }, | |
| { | |
| "start": 223.12, | |
| "text": "will and everything is in units of" | |
| }, | |
| { | |
| "start": 224.799, | |
| "text": "tokens everything is about tokens and" | |
| }, | |
| { | |
| "start": 227.08, | |
| "text": "tokenization is the process for" | |
| }, | |
| { | |
| "start": 228.36, | |
| "text": "translating strings or text into" | |
| }, | |
| { | |
| "start": 231.08, | |
| "text": "sequences of tokens and uh vice versa" | |
| }, | |
| { | |
| "start": 234.879, | |
| "text": "when you go into the Llama 2 paper as" | |
| }, | |
| { | |
| "start": 236.879, | |
| "text": "well I can show you that when you search" | |
| }, | |
| { | |
| "start": 238.28, | |
| "text": "token you're going to get get 63 hits um" | |
| }, | |
| { | |
| "start": 241.72, | |
| "text": "and that's because tokens are again" | |
| }, | |
| { | |
| "start": 243.319, | |
| "text": "pervasive so here they mentioned that" | |
| }, | |
| { | |
| "start": 245.12, | |
| "text": "they trained on two trillion tokens of" | |
| }, | |
| { | |
| "start": 246.879, | |
| "text": "data and so" | |
| }, | |
| { | |
| "start": 248.439, | |
| "text": "on so we're going to build our own" | |
| }, | |
| { | |
| "start": 251.079, | |
| "text": "tokenizer luckily the bite be encoding" | |
| }, | |
| { | |
| "start": 253.04, | |
| "text": "algorithm is not uh that super" | |
| }, | |
| { | |
| "start": 255.12, | |
| "text": "complicated and we can build it from" | |
| }, | |
| { | |
| "start": 256.959, | |
| "text": "scratch ourselves and we'll see exactly" | |
| }, | |
| { | |
| "start": 258.519, | |
| "text": "how this works before we dive into code" | |
| }, | |
| { | |
| "start": 260.72, | |
| "text": "I'd like to give you a brief Taste of" | |
| }, | |
| { | |
| "start": 262.56, | |
| "text": "some of the complexities that come from" | |
| }, | |
| { | |
| "start": 264.12, | |
| "text": "the tokenization because I just want to" | |
| }, | |
| { | |
| "start": 266.12, | |
| "text": "make sure that we motivate it" | |
| }, | |
| { | |
| "start": 267.199, | |
| "text": "sufficiently for why we are doing all" | |
| }, | |
| { | |
| "start": 269.479, | |
| "text": "this and why this is so gross so" | |
| }, | |
| { | |
| "start": 272.639, | |
| "text": "tokenization is at the heart of a lot of" | |
| }, | |
| { | |
| "start": 274.199, | |
| "text": "weirdness in large language models and I" | |
| }, | |
| { | |
| "start": 276.12, | |
| "text": "would advise that you do not brush it" | |
| }, | |
| { | |
| "start": 277.759, | |
| "text": "off a lot of the issues that may look" | |
| }, | |
| { | |
| "start": 280.6, | |
| "text": "like just issues with the new network" | |
| }, | |
| { | |
| "start": 282.32, | |
| "text": "architecture or the large language model" | |
| }, | |
| { | |
| "start": 284.52, | |
| "text": "itself are actually issues with the" | |
| }, | |
| { | |
| "start": 286.6, | |
| "text": "tokenization and fundamentally Trace uh" | |
| }, | |
| { | |
| "start": 289.16, | |
| "text": "back to it so if you've noticed any" | |
| }, | |
| { | |
| "start": 291.759, | |
| "text": "issues with large language models can't" | |
| }, | |
| { | |
| "start": 294.24, | |
| "text": "you know not able to do spelling tasks" | |
| }, | |
| { | |
| "start": 296.16, | |
| "text": "very easily that's usually due to" | |
| }, | |
| { | |
| "start": 297.96, | |
| "text": "tokenization simple string processing" | |
| }, | |
| { | |
| "start": 300.16, | |
| "text": "can be difficult for the large language" | |
| }, | |
| { | |
| "start": 302.28, | |
| "text": "model to perform" | |
| }, | |
| { | |
| "start": 303.6, | |
| "text": "natively uh non-english languages can" | |
| }, | |
| { | |
| "start": 306.08, | |
| "text": "work much worse and to a large extent" | |
| }, | |
| { | |
| "start": 308.24, | |
| "text": "this is due to" | |
| }, | |
| { | |
| "start": 309.44, | |
| "text": "tokenization sometimes llms are bad at" | |
| }, | |
| { | |
| "start": 311.759, | |
| "text": "simple arithmetic also can trace be" | |
| }, | |
| { | |
| "start": 314.08, | |
| "text": "traced to" | |
| }, | |
| { | |
| "start": 315.479, | |
| "text": "tokenization uh gbt2 specifically would" | |
| }, | |
| { | |
| "start": 317.759, | |
| "text": "have had quite a bit more issues with" | |
| }, | |
| { | |
| "start": 319.639, | |
| "text": "python than uh future versions of it due" | |
| }, | |
| { | |
| "start": 322.16, | |
| "text": "to tokenization there's a lot of other" | |
| }, | |
| { | |
| "start": 324.4, | |
| "text": "issues maybe you've seen weird warnings" | |
| }, | |
| { | |
| "start": 325.88, | |
| "text": "about a trailing whites space this is a" | |
| }, | |
| { | |
| "start": 327.44, | |
| "text": "tokenization issue um" | |
| }, | |
| { | |
| "start": 330.68, | |
| "text": "if you had asked GPT earlier about solid" | |
| }, | |
| { | |
| "start": 333.52, | |
| "text": "gold Magikarp and what it is you would" | |
| }, | |
| { | |
| "start": 335.24, | |
| "text": "see the llm go totally crazy and it" | |
| }, | |
| { | |
| "start": 337.52, | |
| "text": "would start going off about a completely" | |
| }, | |
| { | |
| "start": 339.56, | |
| "text": "unrelated tangent topic maybe you've" | |
| }, | |
| { | |
| "start": 341.919, | |
| "text": "been told to use yl over Json in" | |
| }, | |
| { | |
| "start": 343.72, | |
| "text": "structure data all of that has to do" | |
| }, | |
| { | |
| "start": 345.44, | |
| "text": "with tokenization so basically" | |
| }, | |
| { | |
| "start": 347.639, | |
| "text": "tokenization is at the heart of many" | |
| }, | |
| { | |
| "start": 349.4, | |
| "text": "issues I will look back around to these" | |
| }, | |
| { | |
| "start": 351.88, | |
| "text": "at the end of the video but for now let" | |
| }, | |
| { | |
| "start": 354.08, | |
| "text": "me just um skip over it a little bit and" | |
| }, | |
| { | |
| "start": 356.919, | |
| "text": "let's go to this web app um the Tik" | |
| }, | |
| { | |
| "start": 359.96, | |
| "text": "tokenizer bell.app so I have it loaded" | |
| }, | |
| { | |
| "start": 362.919, | |
| "text": "here and what I like about this web app" | |
| }, | |
| { | |
| "start": 364.68, | |
| "text": "is that tokenization is running a sort" | |
| }, | |
| { | |
| "start": 366.56, | |
| "text": "of live in your browser in JavaScript so" | |
| }, | |
| { | |
| "start": 369.52, | |
| "text": "you can just type here stuff hello world" | |
| }, | |
| { | |
| "start": 371.96, | |
| "text": "and the whole string" | |
| }, | |
| { | |
| "start": 374.199, | |
| "text": "rokenes so here what we see on uh the" | |
| }, | |
| { | |
| "start": 378.479, | |
| "text": "left is a string that you put in on the" | |
| }, | |
| { | |
| "start": 380.36, | |
| "text": "right we're currently using the gpt2" | |
| }, | |
| { | |
| "start": 382.199, | |
| "text": "tokenizer we see that this string that I" | |
| }, | |
| { | |
| "start": 384.56, | |
| "text": "pasted here is currently tokenizing into" | |
| }, | |
| { | |
| "start": 387.08, | |
| "text": "300 tokens and here they are sort of uh" | |
| }, | |
| { | |
| "start": 390.52, | |
| "text": "shown explicitly in different colors for" | |
| }, | |
| { | |
| "start": 392.68, | |
| "text": "every single token so for example uh" | |
| }, | |
| { | |
| "start": 395.52, | |
| "text": "this word tokenization became two tokens" | |
| }, | |
| { | |
| "start": 398.88, | |
| "text": "the token" | |
| }, | |
| { | |
| "start": 400.72, | |
| "text": "3,642 and" | |
| }, | |
| { | |
| "start": 404.0, | |
| "text": "1,634 the token um space is is token 318" | |
| }, | |
| { | |
| "start": 410.16, | |
| "text": "so be careful on the bottom you can show" | |
| }, | |
| { | |
| "start": 411.919, | |
| "text": "white space and keep in mind that there" | |
| }, | |
| { | |
| "start": 414.599, | |
| "text": "are spaces and uh sln new line" | |
| }, | |
| { | |
| "start": 417.36, | |
| "text": "characters in here but you can hide them" | |
| }, | |
| { | |
| "start": 419.72, | |
| "text": "for" | |
| }, | |
| { | |
| "start": 421.599, | |
| "text": "clarity the token space at is token 379" | |
| }, | |
| { | |
| "start": 426.0, | |
| "text": "the to the Token space the is 262 Etc so" | |
| }, | |
| { | |
| "start": 431.08, | |
| "text": "you notice here that the space is part" | |
| }, | |
| { | |
| "start": 432.96, | |
| "text": "of that uh token" | |
| }, | |
| { | |
| "start": 435.96, | |
| "text": "chunk now so this is kind of like how" | |
| }, | |
| { | |
| "start": 438.639, | |
| "text": "our English sentence broke up and that" | |
| }, | |
| { | |
| "start": 441.16, | |
| "text": "seems all well and good now now here I" | |
| }, | |
| { | |
| "start": 444.039, | |
| "text": "put in some arithmetic so we see that uh" | |
| }, | |
| { | |
| "start": 446.919, | |
| "text": "the token 127 Plus and then token six" | |
| }, | |
| { | |
| "start": 451.8, | |
| "text": "space 6 followed by 77 so what's" | |
| }, | |
| { | |
| "start": 454.24, | |
| "text": "happening here is that 127 is feeding in" | |
| }, | |
| { | |
| "start": 456.639, | |
| "text": "as a single token into the large" | |
| }, | |
| { | |
| "start": 458.16, | |
| "text": "language model but the um number 677" | |
| }, | |
| { | |
| "start": 462.68, | |
| "text": "will actually feed in as two separate" | |
| }, | |
| { | |
| "start": 464.84, | |
| "text": "tokens and so the large language model" | |
| }, | |
| { | |
| "start": 467.0, | |
| "text": "has to sort of um take account of that" | |
| }, | |
| { | |
| "start": 470.72, | |
| "text": "and process it correctly in its Network" | |
| }, | |
| { | |
| "start": 473.879, | |
| "text": "and see here 804 will be broken up into" | |
| }, | |
| { | |
| "start": 476.199, | |
| "text": "two tokens and it's is all completely" | |
| }, | |
| { | |
| "start": 477.96, | |
| "text": "arbitrary and here I have another" | |
| }, | |
| { | |
| "start": 479.8, | |
| "text": "example of four-digit numbers and they" | |
| }, | |
| { | |
| "start": 482.039, | |
| "text": "break up in a way that they break up and" | |
| }, | |
| { | |
| "start": 483.919, | |
| "text": "it's totally arbitrary sometimes you" | |
| }, | |
| { | |
| "start": 485.28, | |
| "text": "have um multiple digits single token" | |
| }, | |
| { | |
| "start": 488.36, | |
| "text": "sometimes you have individual digits as" | |
| }, | |
| { | |
| "start": 490.36, | |
| "text": "many tokens and it's all kind of pretty" | |
| }, | |
| { | |
| "start": 492.24, | |
| "text": "arbitrary and coming out of the" | |
| }, | |
| { | |
| "start": 494.68, | |
| "text": "tokenizer here's another example we have" | |
| }, | |
| { | |
| "start": 497.479, | |
| "text": "the string egg and you see here that" | |
| }, | |
| { | |
| "start": 501.039, | |
| "text": "this became two" | |
| }, | |
| { | |
| "start": 502.36, | |
| "text": "tokens but for some reason when I say I" | |
| }, | |
| { | |
| "start": 504.759, | |
| "text": "have an egg you see when it's a space" | |
| }, | |
| { | |
| "start": 507.72, | |
| "text": "egg it's two token it's sorry it's a" | |
| }, | |
| { | |
| "start": 510.84, | |
| "text": "single token so just egg by itself in" | |
| }, | |
| { | |
| "start": 513.24, | |
| "text": "the beginning of a sentence is two" | |
| }, | |
| { | |
| "start": 514.76, | |
| "text": "tokens but here as a space egg is" | |
| }, | |
| { | |
| "start": 517.68, | |
| "text": "suddenly a single token uh for the exact" | |
| }, | |
| { | |
| "start": 520.519, | |
| "text": "same string okay here lowercase egg" | |
| }, | |
| { | |
| "start": 524.2, | |
| "text": "turns out to be a single token and in" | |
| }, | |
| { | |
| "start": 526.24, | |
| "text": "particular notice that the color is" | |
| }, | |
| { | |
| "start": 527.48, | |
| "text": "different so this is a different token" | |
| }, | |
| { | |
| "start": 529.36, | |
| "text": "so this is case sensitive and of course" | |
| }, | |
| { | |
| "start": 531.76, | |
| "text": "a capital egg would also be different" | |
| }, | |
| { | |
| "start": 534.56, | |
| "text": "tokens and again um this would be two" | |
| }, | |
| { | |
| "start": 537.44, | |
| "text": "tokens arbitrarily so so for the same" | |
| }, | |
| { | |
| "start": 540.079, | |
| "text": "concept egg depending on if it's in the" | |
| }, | |
| { | |
| "start": 542.32, | |
| "text": "beginning of a sentence at the end of a" | |
| }, | |
| { | |
| "start": 543.8, | |
| "text": "sentence lowercase uppercase or mixed" | |
| }, | |
| { | |
| "start": 546.24, | |
| "text": "all this will be uh basically very" | |
| }, | |
| { | |
| "start": 548.079, | |
| "text": "different tokens and different IDs and" | |
| }, | |
| { | |
| "start": 550.32, | |
| "text": "the language model has to learn from raw" | |
| }, | |
| { | |
| "start": 552.04, | |
| "text": "data from all the internet text that" | |
| }, | |
| { | |
| "start": 553.56, | |
| "text": "it's going to be training on that these" | |
| }, | |
| { | |
| "start": 555.16, | |
| "text": "are actually all the exact same concept" | |
| }, | |
| { | |
| "start": 557.44, | |
| "text": "and it has to sort of group them in the" | |
| }, | |
| { | |
| "start": 559.279, | |
| "text": "parameters of the neural network and" | |
| }, | |
| { | |
| "start": 561.32, | |
| "text": "understand just based on the data" | |
| }, | |
| { | |
| "start": 562.48, | |
| "text": "patterns that these are all very similar" | |
| }, | |
| { | |
| "start": 564.76, | |
| "text": "but maybe not almost exactly similar but" | |
| }, | |
| { | |
| "start": 567.399, | |
| "text": "but very very similar" | |
| }, | |
| { | |
| "start": 570.16, | |
| "text": "um after the EG demonstration here I" | |
| }, | |
| { | |
| "start": 572.8, | |
| "text": "have um an introduction from open a eyes" | |
| }, | |
| { | |
| "start": 575.64, | |
| "text": "chbt in Korean so manaso Pang uh Etc uh" | |
| }, | |
| { | |
| "start": 581.959, | |
| "text": "so this is in Korean and the reason I" | |
| }, | |
| { | |
| "start": 584.079, | |
| "text": "put this here is because you'll notice" | |
| }, | |
| { | |
| "start": 587.76, | |
| "text": "that um non-english languages work" | |
| }, | |
| { | |
| "start": 591.0, | |
| "text": "slightly worse in Chachi part of this is" | |
| }, | |
| { | |
| "start": 594.32, | |
| "text": "because of course the training data set" | |
| }, | |
| { | |
| "start": 595.64, | |
| "text": "for Chachi is much larger for English" | |
| }, | |
| { | |
| "start": 598.079, | |
| "text": "and for everything else but the same is" | |
| }, | |
| { | |
| "start": 599.959, | |
| "text": "true not just for the large language" | |
| }, | |
| { | |
| "start": 601.68, | |
| "text": "model itself but also for the tokenizer" | |
| }, | |
| { | |
| "start": 604.32, | |
| "text": "so when we train the tokenizer we're" | |
| }, | |
| { | |
| "start": 605.88, | |
| "text": "going to see that there's a training set" | |
| }, | |
| { | |
| "start": 607.24, | |
| "text": "as well and there's a lot more English" | |
| }, | |
| { | |
| "start": 609.24, | |
| "text": "than non-english and what ends up" | |
| }, | |
| { | |
| "start": 611.32, | |
| "text": "happening is that we're going to have a" | |
| }, | |
| { | |
| "start": 613.48, | |
| "text": "lot more longer tokens for" | |
| }, | |
| { | |
| "start": 616.6, | |
| "text": "English so how do I put this if you have" | |
| }, | |
| { | |
| "start": 619.6, | |
| "text": "a single sentence in English and you" | |
| }, | |
| { | |
| "start": 621.399, | |
| "text": "tokenize it you might see that it's 10" | |
| }, | |
| { | |
| "start": 623.56, | |
| "text": "tokens or something like that but if you" | |
| }, | |
| { | |
| "start": 625.48, | |
| "text": "translate that sentence into say Korean" | |
| }, | |
| { | |
| "start": 627.36, | |
| "text": "or Japanese or something else you'll" | |
| }, | |
| { | |
| "start": 629.44, | |
| "text": "typically see that the number of tokens" | |
| }, | |
| { | |
| "start": 630.839, | |
| "text": "used is much larger and that's because" | |
| }, | |
| { | |
| "start": 633.399, | |
| "text": "the chunks here are a lot more broken up" | |
| }, | |
| { | |
| "start": 636.76, | |
| "text": "so we're using a lot more tokens for the" | |
| }, | |
| { | |
| "start": 638.519, | |
| "text": "exact same thing and what this does is" | |
| }, | |
| { | |
| "start": 641.36, | |
| "text": "it bloats up the sequence length of all" | |
| }, | |
| { | |
| "start": 643.76, | |
| "text": "the documents so you're using up more" | |
| }, | |
| { | |
| "start": 646.24, | |
| "text": "tokens and then in the attention of the" | |
| }, | |
| { | |
| "start": 648.399, | |
| "text": "Transformer when these tokens try to" | |
| }, | |
| { | |
| "start": 649.92, | |
| "text": "attend each other you are running out of" | |
| }, | |
| { | |
| "start": 651.92, | |
| "text": "context um in the maximum context length" | |
| }, | |
| { | |
| "start": 655.12, | |
| "text": "of that Transformer and so basically all" | |
| }, | |
| { | |
| "start": 657.959, | |
| "text": "the non-english text is stretched out" | |
| }, | |
| { | |
| "start": 661.279, | |
| "text": "from the perspective of the Transformer" | |
| }, | |
| { | |
| "start": 663.44, | |
| "text": "and this just has to do with the um" | |
| }, | |
| { | |
| "start": 665.68, | |
| "text": "trainings that used for the tokenizer" | |
| }, | |
| { | |
| "start": 667.48, | |
| "text": "and the tokenization itself so it will" | |
| }, | |
| { | |
| "start": 670.04, | |
| "text": "create a lot bigger tokens and a lot" | |
| }, | |
| { | |
| "start": 672.079, | |
| "text": "larger groups in English and it will" | |
| }, | |
| { | |
| "start": 674.2, | |
| "text": "have a lot of little boundaries for all" | |
| }, | |
| { | |
| "start": 676.16, | |
| "text": "the other non-english text um so if we" | |
| }, | |
| { | |
| "start": 679.76, | |
| "text": "translated this into English it would be" | |
| }, | |
| { | |
| "start": 681.92, | |
| "text": "significantly fewer" | |
| }, | |
| { | |
| "start": 683.32, | |
| "text": "tokens the final example I have here is" | |
| }, | |
| { | |
| "start": 685.639, | |
| "text": "a little snippet of python for doing FS" | |
| }, | |
| { | |
| "start": 688.079, | |
| "text": "buuz and what I'd like you to notice is" | |
| }, | |
| { | |
| "start": 691.0, | |
| "text": "look all these individual spaces are all" | |
| }, | |
| { | |
| "start": 694.04, | |
| "text": "separate tokens they are token" | |
| }, | |
| { | |
| "start": 697.0, | |
| "text": "220 so uh 220 220 220 220 and then space" | |
| }, | |
| { | |
| "start": 702.76, | |
| "text": "if is a single token and so what's going" | |
| }, | |
| { | |
| "start": 705.32, | |
| "text": "on here is that when the Transformer is" | |
| }, | |
| { | |
| "start": 706.72, | |
| "text": "going to consume or try to uh create" | |
| }, | |
| { | |
| "start": 709.32, | |
| "text": "this text it needs to um handle all" | |
| }, | |
| { | |
| "start": 712.639, | |
| "text": "these spaces individually they all feed" | |
| }, | |
| { | |
| "start": 714.48, | |
| "text": "in one by one into the entire" | |
| }, | |
| { | |
| "start": 716.56, | |
| "text": "Transformer in the sequence and so this" | |
| }, | |
| { | |
| "start": 719.12, | |
| "text": "is being extremely wasteful tokenizing" | |
| }, | |
| { | |
| "start": 721.279, | |
| "text": "it in this way and so as a result of" | |
| }, | |
| { | |
| "start": 724.44, | |
| "text": "that gpt2 is not very good with python" | |
| }, | |
| { | |
| "start": 727.04, | |
| "text": "and it's not anything to do with coding" | |
| }, | |
| { | |
| "start": 728.68, | |
| "text": "or the language model itself it's just" | |
| }, | |
| { | |
| "start": 730.68, | |
| "text": "that if he use a lot of indentation" | |
| }, | |
| { | |
| "start": 732.079, | |
| "text": "using space in Python like we usually do" | |
| }, | |
| { | |
| "start": 735.399, | |
| "text": "uh you just end up bloating out all the" | |
| }, | |
| { | |
| "start": 737.399, | |
| "text": "text and it's separated across way too" | |
| }, | |
| { | |
| "start": 739.36, | |
| "text": "much of the sequence and we are running" | |
| }, | |
| { | |
| "start": 741.04, | |
| "text": "out of the context length in the" | |
| }, | |
| { | |
| "start": 742.76, | |
| "text": "sequence uh that's roughly speaking" | |
| }, | |
| { | |
| "start": 744.44, | |
| "text": "what's what's happening we're being way" | |
| }, | |
| { | |
| "start": 745.639, | |
| "text": "too wasteful we're taking up way too" | |
| }, | |
| { | |
| "start": 747.399, | |
| "text": "much token space now we can also scroll" | |
| }, | |
| { | |
| "start": 749.68, | |
| "text": "up here and we can change the tokenizer" | |
| }, | |
| { | |
| "start": 751.6, | |
| "text": "so note here that gpt2 tokenizer creates" | |
| }, | |
| { | |
| "start": 754.04, | |
| "text": "a token count of 300 for this string" | |
| }, | |
| { | |
| "start": 756.72, | |
| "text": "here we can change it to CL 100K base" | |
| }, | |
| { | |
| "start": 759.519, | |
| "text": "which is the GPT for tokenizer and we" | |
| }, | |
| { | |
| "start": 761.839, | |
| "text": "see that the token count drops to 185 so" | |
| }, | |
| { | |
| "start": 764.56, | |
| "text": "for the exact same string we are now" | |
| }, | |
| { | |
| "start": 766.8, | |
| "text": "roughly having the number of tokens and" | |
| }, | |
| { | |
| "start": 769.8, | |
| "text": "roughly speaking this is because uh the" | |
| }, | |
| { | |
| "start": 771.76, | |
| "text": "number of tokens in the GPT 4 tokenizer" | |
| }, | |
| { | |
| "start": 774.36, | |
| "text": "is roughly double that of the number of" | |
| }, | |
| { | |
| "start": 776.72, | |
| "text": "tokens in the gpt2 tokenizer so we went" | |
| }, | |
| { | |
| "start": 778.839, | |
| "text": "went from roughly 50k to roughly 100K" | |
| }, | |
| { | |
| "start": 781.639, | |
| "text": "now you can imagine that this is a good" | |
| }, | |
| { | |
| "start": 783.0, | |
| "text": "thing because the same text is now" | |
| }, | |
| { | |
| "start": 786.0, | |
| "text": "squished into half as many tokens so uh" | |
| }, | |
| { | |
| "start": 790.199, | |
| "text": "this is a lot denser input to the" | |
| }, | |
| { | |
| "start": 792.76, | |
| "text": "Transformer and in the Transformer every" | |
| }, | |
| { | |
| "start": 795.44, | |
| "text": "single token has a finite number of" | |
| }, | |
| { | |
| "start": 797.04, | |
| "text": "tokens before it that it's going to pay" | |
| }, | |
| { | |
| "start": 798.399, | |
| "text": "attention to and so what this is doing" | |
| }, | |
| { | |
| "start": 800.44, | |
| "text": "is we're roughly able to see twice as" | |
| }, | |
| { | |
| "start": 803.48, | |
| "text": "much text as a context for what token to" | |
| }, | |
| { | |
| "start": 806.519, | |
| "text": "predict next uh because of this change" | |
| }, | |
| { | |
| "start": 809.279, | |
| "text": "but of course just increasing the number" | |
| }, | |
| { | |
| "start": 810.8, | |
| "text": "of tokens is uh not strictly better" | |
| }, | |
| { | |
| "start": 813.399, | |
| "text": "infinitely uh because as you increase" | |
| }, | |
| { | |
| "start": 815.16, | |
| "text": "the number of tokens now your embedding" | |
| }, | |
| { | |
| "start": 816.92, | |
| "text": "table is um sort of getting a lot larger" | |
| }, | |
| { | |
| "start": 819.88, | |
| "text": "and also at the output we are trying to" | |
| }, | |
| { | |
| "start": 821.48, | |
| "text": "predict the next token and there's the" | |
| }, | |
| { | |
| "start": 822.88, | |
| "text": "soft Max there and that grows as well" | |
| }, | |
| { | |
| "start": 825.12, | |
| "text": "we're going to go into more detail later" | |
| }, | |
| { | |
| "start": 826.399, | |
| "text": "on this but there's some kind of a Sweet" | |
| }, | |
| { | |
| "start": 828.44, | |
| "text": "Spot somewhere where you have a just" | |
| }, | |
| { | |
| "start": 831.0, | |
| "text": "right number of tokens in your" | |
| }, | |
| { | |
| "start": 832.279, | |
| "text": "vocabulary where everything is" | |
| }, | |
| { | |
| "start": 833.88, | |
| "text": "appropriately dense and still fairly" | |
| }, | |
| { | |
| "start": 836.519, | |
| "text": "efficient now one thing I would like you" | |
| }, | |
| { | |
| "start": 838.36, | |
| "text": "to note specifically for the gp4" | |
| }, | |
| { | |
| "start": 840.16, | |
| "text": "tokenizer is that the handling of the" | |
| }, | |
| { | |
| "start": 843.56, | |
| "text": "white space for python has improved a" | |
| }, | |
| { | |
| "start": 845.44, | |
| "text": "lot you see that here these four spaces" | |
| }, | |
| { | |
| "start": 848.36, | |
| "text": "are represented as one single token for" | |
| }, | |
| { | |
| "start": 850.24, | |
| "text": "the three spaces here and then the token" | |
| }, | |
| { | |
| "start": 853.759, | |
| "text": "SPF and here seven spaces were all" | |
| }, | |
| { | |
| "start": 856.759, | |
| "text": "grouped into a single token so we're" | |
| }, | |
| { | |
| "start": 858.8, | |
| "text": "being a lot more efficient in how we" | |
| }, | |
| { | |
| "start": 860.199, | |
| "text": "represent Python and this was a" | |
| }, | |
| { | |
| "start": 861.92, | |
| "text": "deliberate Choice made by open aai when" | |
| }, | |
| { | |
| "start": 863.759, | |
| "text": "they designed the gp4 tokenizer and they" | |
| }, | |
| { | |
| "start": 867.56, | |
| "text": "group a lot more space into a single" | |
| }, | |
| { | |
| "start": 869.68, | |
| "text": "character what this does is this" | |
| }, | |
| { | |
| "start": 872.079, | |
| "text": "densifies Python and therefore we can" | |
| }, | |
| { | |
| "start": 875.199, | |
| "text": "attend to more code before it when we're" | |
| }, | |
| { | |
| "start": 878.12, | |
| "text": "trying to predict the next token in the" | |
| }, | |
| { | |
| "start": 879.72, | |
| "text": "sequence and so the Improvement in the" | |
| }, | |
| { | |
| "start": 882.04, | |
| "text": "python coding ability from gbt2 to gp4" | |
| }, | |
| { | |
| "start": 885.399, | |
| "text": "is not just a matter of the language" | |
| }, | |
| { | |
| "start": 887.079, | |
| "text": "model and the architecture and the" | |
| }, | |
| { | |
| "start": 888.839, | |
| "text": "details of the optimization but a lot of" | |
| }, | |
| { | |
| "start": 890.759, | |
| "text": "the Improvement here is also coming from" | |
| }, | |
| { | |
| "start": 892.24, | |
| "text": "the design of the tokenizer and how it" | |
| }, | |
| { | |
| "start": 894.24, | |
| "text": "groups characters into tokens okay so" | |
| }, | |
| { | |
| "start": 896.959, | |
| "text": "let's now start writing some code" | |
| }, | |
| { | |
| "start": 899.399, | |
| "text": "so remember what we want to do we want" | |
| }, | |
| { | |
| "start": 901.44, | |
| "text": "to take strings and feed them into" | |
| }, | |
| { | |
| "start": 903.72, | |
| "text": "language models for that we need to" | |
| }, | |
| { | |
| "start": 905.959, | |
| "text": "somehow tokenize strings into some" | |
| }, | |
| { | |
| "start": 908.8, | |
| "text": "integers in some fixed vocabulary and" | |
| }, | |
| { | |
| "start": 912.36, | |
| "text": "then we will use those integers to make" | |
| }, | |
| { | |
| "start": 914.24, | |
| "text": "a look up into a lookup table of vectors" | |
| }, | |
| { | |
| "start": 916.759, | |
| "text": "and feed those vectors into the" | |
| }, | |
| { | |
| "start": 918.0, | |
| "text": "Transformer as an input now the reason" | |
| }, | |
| { | |
| "start": 921.36, | |
| "text": "this gets a little bit tricky of course" | |
| }, | |
| { | |
| "start": 922.72, | |
| "text": "is that we don't just want to support" | |
| }, | |
| { | |
| "start": 924.0, | |
| "text": "the simple English alphabet we want to" | |
| }, | |
| { | |
| "start": 926.12, | |
| "text": "support different kinds of languages so" | |
| }, | |
| { | |
| "start": 928.12, | |
| "text": "this is anango in Korean which is hello" | |
| }, | |
| { | |
| "start": 931.639, | |
| "text": "and we also want to support many kinds" | |
| }, | |
| { | |
| "start": 933.0, | |
| "text": "of special characters that we might find" | |
| }, | |
| { | |
| "start": 934.72, | |
| "text": "on the internet for example" | |
| }, | |
| { | |
| "start": 937.319, | |
| "text": "Emoji so how do we feed this text into" | |
| }, | |
| { | |
| "start": 941.48, | |
| "text": "uh" | |
| }, | |
| { | |
| "start": 942.199, | |
| "text": "Transformers well how's the what is this" | |
| }, | |
| { | |
| "start": 944.48, | |
| "text": "text anyway in Python so if you go to" | |
| }, | |
| { | |
| "start": 946.56, | |
| "text": "the documentation of a string in Python" | |
| }, | |
| { | |
| "start": 949.6, | |
| "text": "you can see that strings are immutable" | |
| }, | |
| { | |
| "start": 951.519, | |
| "text": "sequences of Unicode code" | |
| }, | |
| { | |
| "start": 954.12, | |
| "text": "points okay what are Unicode code points" | |
| }, | |
| { | |
| "start": 957.88, | |
| "text": "we can go to PDF so Unicode code points" | |
| }, | |
| { | |
| "start": 961.48, | |
| "text": "are defined by the Unicode Consortium as" | |
| }, | |
| { | |
| "start": 964.68, | |
| "text": "part of the Unicode standard and what" | |
| }, | |
| { | |
| "start": 967.56, | |
| "text": "this is really is that it's just a" | |
| }, | |
| { | |
| "start": 969.0, | |
| "text": "definition of roughly 150,000 characters" | |
| }, | |
| { | |
| "start": 971.839, | |
| "text": "right now and roughly speaking what they" | |
| }, | |
| { | |
| "start": 974.72, | |
| "text": "look like and what integers um represent" | |
| }, | |
| { | |
| "start": 977.56, | |
| "text": "those characters so it says 150,000" | |
| }, | |
| { | |
| "start": 979.72, | |
| "text": "characters across 161 scripts as of" | |
| }, | |
| { | |
| "start": 982.639, | |
| "text": "right now so if you scroll down here you" | |
| }, | |
| { | |
| "start": 984.72, | |
| "text": "can see that the standard is very much" | |
| }, | |
| { | |
| "start": 986.279, | |
| "text": "alive the latest standard 15.1 in" | |
| }, | |
| { | |
| "start": 988.72, | |
| "text": "September" | |
| }, | |
| { | |
| "start": 990.199, | |
| "text": "2023 and basically this is just a way to" | |
| }, | |
| { | |
| "start": 993.92, | |
| "text": "define lots of types of" | |
| }, | |
| { | |
| "start": 996.92, | |
| "text": "characters like for example all these" | |
| }, | |
| { | |
| "start": 999.16, | |
| "text": "characters across different scripts so" | |
| }, | |
| { | |
| "start": 1001.88, | |
| "text": "the way we can access the unic code code" | |
| }, | |
| { | |
| "start": 1004.04, | |
| "text": "Point given Single Character is by using" | |
| }, | |
| { | |
| "start": 1005.959, | |
| "text": "the or function in Python so for example" | |
| }, | |
| { | |
| "start": 1008.199, | |
| "text": "I can pass in Ord of H and I can see" | |
| }, | |
| { | |
| "start": 1011.279, | |
| "text": "that for the Single Character H the unic" | |
| }, | |
| { | |
| "start": 1014.72, | |
| "text": "code code point is" | |
| }, | |
| { | |
| "start": 1016.48, | |
| "text": "104 okay um but this can be arbitr" | |
| }, | |
| { | |
| "start": 1020.399, | |
| "text": "complicated so we can take for example" | |
| }, | |
| { | |
| "start": 1022.16, | |
| "text": "our Emoji here and we can see that the" | |
| }, | |
| { | |
| "start": 1024.16, | |
| "text": "code point for this one is" | |
| }, | |
| { | |
| "start": 1026.4, | |
| "text": "128,000 or we can take" | |
| }, | |
| { | |
| "start": 1030.36, | |
| "text": "un and this is 50,000 now keep in mind" | |
| }, | |
| { | |
| "start": 1033.72, | |
| "text": "you can't plug in strings here because" | |
| }, | |
| { | |
| "start": 1036.72, | |
| "text": "you uh this doesn't have a single code" | |
| }, | |
| { | |
| "start": 1038.439, | |
| "text": "point it only takes a single uni code" | |
| }, | |
| { | |
| "start": 1040.679, | |
| "text": "code Point character and tells you its" | |
| }, | |
| { | |
| "start": 1043.959, | |
| "text": "integer so in this way we can look" | |
| }, | |
| { | |
| "start": 1046.799, | |
| "text": "up all the um characters of this" | |
| }, | |
| { | |
| "start": 1050.08, | |
| "text": "specific string and their code points so" | |
| }, | |
| { | |
| "start": 1052.16, | |
| "text": "or of X forx in this string and we get" | |
| }, | |
| { | |
| "start": 1056.76, | |
| "text": "this encoding here now see here we've" | |
| }, | |
| { | |
| "start": 1060.36, | |
| "text": "already turned the raw code points" | |
| }, | |
| { | |
| "start": 1062.2, | |
| "text": "already have integers so why can't we" | |
| }, | |
| { | |
| "start": 1064.44, | |
| "text": "simply just use these integers and not" | |
| }, | |
| { | |
| "start": 1066.84, | |
| "text": "have any tokenization at all why can't" | |
| }, | |
| { | |
| "start": 1068.559, | |
| "text": "we just use this natively as is and just" | |
| }, | |
| { | |
| "start": 1070.64, | |
| "text": "use the code Point well one reason for" | |
| }, | |
| { | |
| "start": 1072.88, | |
| "text": "that of course is that the vocabulary in" | |
| }, | |
| { | |
| "start": 1074.36, | |
| "text": "that case would be quite long so in this" | |
| }, | |
| { | |
| "start": 1076.799, | |
| "text": "case for Unicode the this is a" | |
| }, | |
| { | |
| "start": 1078.679, | |
| "text": "vocabulary of" | |
| }, | |
| { | |
| "start": 1079.799, | |
| "text": "150,000 different code points but more" | |
| }, | |
| { | |
| "start": 1082.64, | |
| "text": "worryingly than that I think the Unicode" | |
| }, | |
| { | |
| "start": 1085.039, | |
| "text": "standard is very much alive and it keeps" | |
| }, | |
| { | |
| "start": 1087.039, | |
| "text": "changing and so it's not kind of a" | |
| }, | |
| { | |
| "start": 1089.24, | |
| "text": "stable representation necessarily that" | |
| }, | |
| { | |
| "start": 1091.08, | |
| "text": "we may want to use directly so for those" | |
| }, | |
| { | |
| "start": 1093.88, | |
| "text": "reasons we need something a bit better" | |
| }, | |
| { | |
| "start": 1095.76, | |
| "text": "so to find something better we turn to" | |
| }, | |
| { | |
| "start": 1097.64, | |
| "text": "encodings so if we go to the Wikipedia" | |
| }, | |
| { | |
| "start": 1099.76, | |
| "text": "page here we see that the Unicode" | |
| }, | |
| { | |
| "start": 1101.28, | |
| "text": "consortion defines three types of" | |
| }, | |
| { | |
| "start": 1103.799, | |
| "text": "encodings utf8 UTF 16 and UTF 32 these" | |
| }, | |
| { | |
| "start": 1107.96, | |
| "text": "encoding are the way by which we can" | |
| }, | |
| { | |
| "start": 1110.72, | |
| "text": "take Unicode text and translate it into" | |
| }, | |
| { | |
| "start": 1113.48, | |
| "text": "binary data or by streams utf8 is by far" | |
| }, | |
| { | |
| "start": 1117.2, | |
| "text": "the most common uh so this is the utf8" | |
| }, | |
| { | |
| "start": 1119.96, | |
| "text": "page now this Wikipedia page is actually" | |
| }, | |
| { | |
| "start": 1122.0, | |
| "text": "quite long but what's important for our" | |
| }, | |
| { | |
| "start": 1124.4, | |
| "text": "purposes is that utf8 takes every single" | |
| }, | |
| { | |
| "start": 1126.44, | |
| "text": "Cod point and it translates it to a by" | |
| }, | |
| { | |
| "start": 1129.64, | |
| "text": "stream and this by stream is between one" | |
| }, | |
| { | |
| "start": 1132.36, | |
| "text": "to four bytes so it's a variable length" | |
| }, | |
| { | |
| "start": 1134.36, | |
| "text": "encoding so depending on the Unicode" | |
| }, | |
| { | |
| "start": 1136.48, | |
| "text": "Point according to the schema you're" | |
| }, | |
| { | |
| "start": 1138.039, | |
| "text": "going to end up with between 1 to four" | |
| }, | |
| { | |
| "start": 1139.76, | |
| "text": "bytes for each code point on top of that" | |
| }, | |
| { | |
| "start": 1143.0, | |
| "text": "there's utf8 uh" | |
| }, | |
| { | |
| "start": 1145.12, | |
| "text": "utf16 and UTF 32 UTF 32 is nice because" | |
| }, | |
| { | |
| "start": 1148.84, | |
| "text": "it is fixed length instead of variable" | |
| }, | |
| { | |
| "start": 1150.559, | |
| "text": "length but it has many other downsides" | |
| }, | |
| { | |
| "start": 1152.48, | |
| "text": "as well so the full kind of spectrum of" | |
| }, | |
| { | |
| "start": 1157.0, | |
| "text": "pros and cons of all these different" | |
| }, | |
| { | |
| "start": 1158.32, | |
| "text": "three encodings are beyond the scope of" | |
| }, | |
| { | |
| "start": 1160.48, | |
| "text": "this video I just like to point out that" | |
| }, | |
| { | |
| "start": 1162.52, | |
| "text": "I enjoyed this block post and this block" | |
| }, | |
| { | |
| "start": 1165.24, | |
| "text": "post at the end of it also has a number" | |
| }, | |
| { | |
| "start": 1167.039, | |
| "text": "of references that can be quite useful" | |
| }, | |
| { | |
| "start": 1169.24, | |
| "text": "uh one of them is uh utf8 everywhere" | |
| }, | |
| { | |
| "start": 1172.039, | |
| "text": "Manifesto um and this Manifesto" | |
| }, | |
| { | |
| "start": 1174.32, | |
| "text": "describes the reason why utf8 is" | |
| }, | |
| { | |
| "start": 1176.64, | |
| "text": "significantly preferred and a lot nicer" | |
| }, | |
| { | |
| "start": 1179.88, | |
| "text": "than the other encodings and why it is" | |
| }, | |
| { | |
| "start": 1181.799, | |
| "text": "used a lot more prominently um on the" | |
| }, | |
| { | |
| "start": 1185.48, | |
| "text": "internet one of the major advantages" | |
| }, | |
| { | |
| "start": 1188.08, | |
| "text": "just just to give you a sense is that" | |
| }, | |
| { | |
| "start": 1189.559, | |
| "text": "utf8 is the only one of these that is" | |
| }, | |
| { | |
| "start": 1192.0, | |
| "text": "backwards compatible to the much simpler" | |
| }, | |
| { | |
| "start": 1194.2, | |
| "text": "asky encoding of text um but I'm not" | |
| }, | |
| { | |
| "start": 1197.08, | |
| "text": "going to go into the full detail in this" | |
| }, | |
| { | |
| "start": 1198.48, | |
| "text": "video so suffice to say that we like the" | |
| }, | |
| { | |
| "start": 1201.0, | |
| "text": "utf8 encoding and uh let's try to take" | |
| }, | |
| { | |
| "start": 1203.84, | |
| "text": "the string and see what we get if we" | |
| }, | |
| { | |
| "start": 1206.039, | |
| "text": "encoded into" | |
| }, | |
| { | |
| "start": 1208.0, | |
| "text": "utf8 the string class in Python actually" | |
| }, | |
| { | |
| "start": 1210.76, | |
| "text": "has do encode and you can give it the" | |
| }, | |
| { | |
| "start": 1212.36, | |
| "text": "encoding which is say utf8 now we get" | |
| }, | |
| { | |
| "start": 1215.559, | |
| "text": "out of this is not very nice because" | |
| }, | |
| { | |
| "start": 1217.84, | |
| "text": "this is the bytes is a bytes object and" | |
| }, | |
| { | |
| "start": 1220.96, | |
| "text": "it's not very nice in the way that it's" | |
| }, | |
| { | |
| "start": 1222.76, | |
| "text": "printed so I personally like to take it" | |
| }, | |
| { | |
| "start": 1225.039, | |
| "text": "through list because then we actually" | |
| }, | |
| { | |
| "start": 1226.84, | |
| "text": "get the raw B" | |
| }, | |
| { | |
| "start": 1228.72, | |
| "text": "of this uh encoding so this is the raw" | |
| }, | |
| { | |
| "start": 1232.4, | |
| "text": "byes that represent this string" | |
| }, | |
| { | |
| "start": 1235.6, | |
| "text": "according to the utf8 en coding we can" | |
| }, | |
| { | |
| "start": 1238.08, | |
| "text": "also look at utf16 we get a slightly" | |
| }, | |
| { | |
| "start": 1240.559, | |
| "text": "different by stream and we here we start" | |
| }, | |
| { | |
| "start": 1243.24, | |
| "text": "to see one of the disadvantages of utf16" | |
| }, | |
| { | |
| "start": 1245.48, | |
| "text": "you see how we have zero Z something Z" | |
| }, | |
| { | |
| "start": 1247.96, | |
| "text": "something Z something we're starting to" | |
| }, | |
| { | |
| "start": 1249.679, | |
| "text": "get a sense that this is a bit of a" | |
| }, | |
| { | |
| "start": 1250.84, | |
| "text": "wasteful encoding and indeed for simple" | |
| }, | |
| { | |
| "start": 1253.919, | |
| "text": "asky characters or English characters" | |
| }, | |
| { | |
| "start": 1256.28, | |
| "text": "here uh we just have the structure of 0" | |
| }, | |
| { | |
| "start": 1258.559, | |
| "text": "something Z something and it's not" | |
| }, | |
| { | |
| "start": 1260.76, | |
| "text": "exactly nice same for UTF 32 when we" | |
| }, | |
| { | |
| "start": 1264.24, | |
| "text": "expand this we can start to get a sense" | |
| }, | |
| { | |
| "start": 1266.08, | |
| "text": "of the wastefulness of this encoding for" | |
| }, | |
| { | |
| "start": 1268.0, | |
| "text": "our purposes you see a lot of zeros" | |
| }, | |
| { | |
| "start": 1270.4, | |
| "text": "followed by" | |
| }, | |
| { | |
| "start": 1271.4, | |
| "text": "something and so uh this is not" | |
| }, | |
| { | |
| "start": 1274.84, | |
| "text": "desirable so suffice it to say that we" | |
| }, | |
| { | |
| "start": 1277.84, | |
| "text": "would like to stick with utf8 for our" | |
| }, | |
| { | |
| "start": 1280.88, | |
| "text": "purposes however if we just use utf8" | |
| }, | |
| { | |
| "start": 1283.88, | |
| "text": "naively these are by streams so that" | |
| }, | |
| { | |
| "start": 1286.4, | |
| "text": "would imply a vocabulary length of only" | |
| }, | |
| { | |
| "start": 1289.24, | |
| "text": "256 possible tokens uh but this this" | |
| }, | |
| { | |
| "start": 1293.12, | |
| "text": "vocabulary size is very very small what" | |
| }, | |
| { | |
| "start": 1295.32, | |
| "text": "this is going to do if we just were to" | |
| }, | |
| { | |
| "start": 1296.679, | |
| "text": "use it naively is that all of our text" | |
| }, | |
| { | |
| "start": 1299.88, | |
| "text": "would be stretched out over very very" | |
| }, | |
| { | |
| "start": 1301.919, | |
| "text": "long sequences of bytes and so" | |
| }, | |
| { | |
| "start": 1306.159, | |
| "text": "um what what this does is that certainly" | |
| }, | |
| { | |
| "start": 1309.32, | |
| "text": "the embeding table is going to be tiny" | |
| }, | |
| { | |
| "start": 1311.0, | |
| "text": "and the prediction at the top at the" | |
| }, | |
| { | |
| "start": 1312.32, | |
| "text": "final layer is going to be very tiny but" | |
| }, | |
| { | |
| "start": 1314.159, | |
| "text": "our sequences are very long and remember" | |
| }, | |
| { | |
| "start": 1316.44, | |
| "text": "that we have pretty finite um context" | |
| }, | |
| { | |
| "start": 1319.32, | |
| "text": "length and the attention that we can" | |
| }, | |
| { | |
| "start": 1321.0, | |
| "text": "support in a transformer for" | |
| }, | |
| { | |
| "start": 1322.76, | |
| "text": "computational reasons and so we only" | |
| }, | |
| { | |
| "start": 1325.52, | |
| "text": "have as much context length but now we" | |
| }, | |
| { | |
| "start": 1327.48, | |
| "text": "have very very long sequences and this" | |
| }, | |
| { | |
| "start": 1329.44, | |
| "text": "is just inefficient and it's not going" | |
| }, | |
| { | |
| "start": 1330.799, | |
| "text": "to allow us to attend to sufficiently" | |
| }, | |
| { | |
| "start": 1332.799, | |
| "text": "long text uh before us for the purposes" | |
| }, | |
| { | |
| "start": 1335.64, | |
| "text": "of the next token prediction task so we" | |
| }, | |
| { | |
| "start": 1338.36, | |
| "text": "don't want to use the raw bytes of the" | |
| }, | |
| { | |
| "start": 1341.6, | |
| "text": "utf8 encoding we want to be able to" | |
| }, | |
| { | |
| "start": 1344.2, | |
| "text": "support larger vocabulary size that we" | |
| }, | |
| { | |
| "start": 1346.919, | |
| "text": "can tune as a hyper" | |
| }, | |
| { | |
| "start": 1348.64, | |
| "text": "but we want to stick with the utf8" | |
| }, | |
| { | |
| "start": 1350.84, | |
| "text": "encoding of these strings so what do we" | |
| }, | |
| { | |
| "start": 1353.559, | |
| "text": "do well the answer of course is we turn" | |
| }, | |
| { | |
| "start": 1355.48, | |
| "text": "to the bite pair encoding algorithm" | |
| }, | |
| { | |
| "start": 1357.44, | |
| "text": "which will allow us to compress these" | |
| }, | |
| { | |
| "start": 1359.08, | |
| "text": "bite sequences um to a variable amount" | |
| }, | |
| { | |
| "start": 1362.6, | |
| "text": "so we'll get to that in a bit but I just" | |
| }, | |
| { | |
| "start": 1364.679, | |
| "text": "want to briefly speak to the fact that I" | |
| }, | |
| { | |
| "start": 1367.12, | |
| "text": "would love nothing more than to be able" | |
| }, | |
| { | |
| "start": 1369.279, | |
| "text": "to feed raw bite sequences into uh" | |
| }, | |
| { | |
| "start": 1372.96, | |
| "text": "language models in fact there's a paper" | |
| }, | |
| { | |
| "start": 1374.88, | |
| "text": "about how this could potentially be done" | |
| }, | |
| { | |
| "start": 1377.08, | |
| "text": "uh from Summer last last year now the" | |
| }, | |
| { | |
| "start": 1379.279, | |
| "text": "problem is you actually have to go in" | |
| }, | |
| { | |
| "start": 1380.96, | |
| "text": "and you have to modify the Transformer" | |
| }, | |
| { | |
| "start": 1382.279, | |
| "text": "architecture because as I mentioned" | |
| }, | |
| { | |
| "start": 1384.48, | |
| "text": "you're going to have a problem where the" | |
| }, | |
| { | |
| "start": 1386.64, | |
| "text": "attention will start to become extremely" | |
| }, | |
| { | |
| "start": 1388.24, | |
| "text": "expensive because the sequences are so" | |
| }, | |
| { | |
| "start": 1390.36, | |
| "text": "long and so in this paper they propose" | |
| }, | |
| { | |
| "start": 1393.44, | |
| "text": "kind of a hierarchical structuring of" | |
| }, | |
| { | |
| "start": 1395.76, | |
| "text": "the Transformer that could allow you to" | |
| }, | |
| { | |
| "start": 1397.64, | |
| "text": "just feed in raw bites and so at the end" | |
| }, | |
| { | |
| "start": 1400.36, | |
| "text": "they say together these results" | |
| }, | |
| { | |
| "start": 1401.919, | |
| "text": "establish the viability of tokenization" | |
| }, | |
| { | |
| "start": 1403.64, | |
| "text": "free autor regressive sequence modeling" | |
| }, | |
| { | |
| "start": 1405.32, | |
| "text": "at scale so tokenization free would" | |
| }, | |
| { | |
| "start": 1407.4, | |
| "text": "indeed be amazing we would just feed B" | |
| }, | |
| { | |
| "start": 1410.279, | |
| "text": "streams directly into our models but" | |
| }, | |
| { | |
| "start": 1412.279, | |
| "text": "unfortunately I don't know that this has" | |
| }, | |
| { | |
| "start": 1414.159, | |
| "text": "really been proven out yet by" | |
| }, | |
| { | |
| "start": 1416.08, | |
| "text": "sufficiently many groups and a" | |
| }, | |
| { | |
| "start": 1417.24, | |
| "text": "sufficient scale uh but something like" | |
| }, | |
| { | |
| "start": 1419.24, | |
| "text": "this at one point would be amazing and I" | |
| }, | |
| { | |
| "start": 1420.679, | |
| "text": "hope someone comes up with it but for" | |
| }, | |
| { | |
| "start": 1422.32, | |
| "text": "now we have to come back and we can't" | |
| }, | |
| { | |
| "start": 1424.44, | |
| "text": "feed this directly into language models" | |
| }, | |
| { | |
| "start": 1426.44, | |
| "text": "and we have to compress it using the B" | |
| }, | |
| { | |
| "start": 1428.279, | |
| "text": "paare encoding algorithm so let's see" | |
| }, | |
| { | |
| "start": 1429.84, | |
| "text": "how that works so as I mentioned the B" | |
| }, | |
| { | |
| "start": 1431.64, | |
| "text": "paare encoding algorithm is not all that" | |
| }, | |
| { | |
| "start": 1433.52, | |
| "text": "complicated and the Wikipedia page is" | |
| }, | |
| { | |
| "start": 1435.52, | |
| "text": "actually quite instructive as far as the" | |
| }, | |
| { | |
| "start": 1437.159, | |
| "text": "basic idea goes go what we're doing is" | |
| }, | |
| { | |
| "start": 1439.48, | |
| "text": "we have some kind of a input sequence uh" | |
| }, | |
| { | |
| "start": 1441.76, | |
| "text": "like for example here we have only four" | |
| }, | |
| { | |
| "start": 1443.64, | |
| "text": "elements in our vocabulary a b c and d" | |
| }, | |
| { | |
| "start": 1446.32, | |
| "text": "and we have a sequence of them so" | |
| }, | |
| { | |
| "start": 1448.0, | |
| "text": "instead of bytes let's say we just have" | |
| }, | |
| { | |
| "start": 1449.76, | |
| "text": "four a vocab size of" | |
| }, | |
| { | |
| "start": 1452.039, | |
| "text": "four the sequence is too long and we'd" | |
| }, | |
| { | |
| "start": 1454.12, | |
| "text": "like to compress it so what we do is" | |
| }, | |
| { | |
| "start": 1456.159, | |
| "text": "that we iteratively find the pair of uh" | |
| }, | |
| { | |
| "start": 1460.159, | |
| "text": "tokens that occur the most" | |
| }, | |
| { | |
| "start": 1463.44, | |
| "text": "frequently and then once we've" | |
| }, | |
| { | |
| "start": 1465.279, | |
| "text": "identified that pair we repl replace" | |
| }, | |
| { | |
| "start": 1468.48, | |
| "text": "that pair with just a single new token" | |
| }, | |
| { | |
| "start": 1470.88, | |
| "text": "that we append to our vocabulary so for" | |
| }, | |
| { | |
| "start": 1473.559, | |
| "text": "example here the bite pair AA occurs" | |
| }, | |
| { | |
| "start": 1476.279, | |
| "text": "most often so we mint a new token let's" | |
| }, | |
| { | |
| "start": 1478.919, | |
| "text": "call it capital Z and we replace every" | |
| }, | |
| { | |
| "start": 1481.679, | |
| "text": "single occurrence of AA by Z so now we" | |
| }, | |
| { | |
| "start": 1486.0, | |
| "text": "have two Z's here so here we took a" | |
| }, | |
| { | |
| "start": 1488.919, | |
| "text": "sequence of 11 characters with" | |
| }, | |
| { | |
| "start": 1491.799, | |
| "text": "vocabulary size four and we've converted" | |
| }, | |
| { | |
| "start": 1494.44, | |
| "text": "it to a um sequence of only nine tokens" | |
| }, | |
| { | |
| "start": 1498.64, | |
| "text": "but now with a vocabulary of five" | |
| }, | |
| { | |
| "start": 1500.559, | |
| "text": "because we have a fifth vocabulary" | |
| }, | |
| { | |
| "start": 1502.399, | |
| "text": "element that we just created and it's Z" | |
| }, | |
| { | |
| "start": 1504.96, | |
| "text": "standing for concatination of AA and we" | |
| }, | |
| { | |
| "start": 1507.52, | |
| "text": "can again repeat this process so we" | |
| }, | |
| { | |
| "start": 1510.24, | |
| "text": "again look at the sequence and identify" | |
| }, | |
| { | |
| "start": 1512.88, | |
| "text": "the pair of tokens that are most" | |
| }, | |
| { | |
| "start": 1515.64, | |
| "text": "frequent let's say that that is now AB" | |
| }, | |
| { | |
| "start": 1519.159, | |
| "text": "well we are going to replace AB with a" | |
| }, | |
| { | |
| "start": 1520.76, | |
| "text": "new token that we meant call Y so y" | |
| }, | |
| { | |
| "start": 1523.76, | |
| "text": "becomes ab and then every single" | |
| }, | |
| { | |
| "start": 1525.24, | |
| "text": "occurrence of ab is now replaced with y" | |
| }, | |
| { | |
| "start": 1528.039, | |
| "text": "so we end up with this so now we only" | |
| }, | |
| { | |
| "start": 1531.44, | |
| "text": "have 1 2 3 4 5 6 seven characters in our" | |
| }, | |
| { | |
| "start": 1535.159, | |
| "text": "sequence but we have not just um four" | |
| }, | |
| { | |
| "start": 1540.12, | |
| "text": "vocabulary elements or five but now we" | |
| }, | |
| { | |
| "start": 1542.32, | |
| "text": "have six and for the final round we" | |
| }, | |
| { | |
| "start": 1545.799, | |
| "text": "again look through the sequence find" | |
| }, | |
| { | |
| "start": 1547.64, | |
| "text": "that the phrase zy or the pair zy is" | |
| }, | |
| { | |
| "start": 1550.559, | |
| "text": "most common and replace it one more time" | |
| }, | |
| { | |
| "start": 1553.32, | |
| "text": "with another um character let's say x so" | |
| }, | |
| { | |
| "start": 1556.64, | |
| "text": "X is z y and we replace all curses of zy" | |
| }, | |
| { | |
| "start": 1559.919, | |
| "text": "and we get this following sequence so" | |
| }, | |
| { | |
| "start": 1562.12, | |
| "text": "basically after we have gone through" | |
| }, | |
| { | |
| "start": 1563.6, | |
| "text": "this process instead of having a um" | |
| }, | |
| { | |
| "start": 1568.48, | |
| "text": "sequence of" | |
| }, | |
| { | |
| "start": 1569.76, | |
| "text": "11 uh tokens with a vocabulary length of" | |
| }, | |
| { | |
| "start": 1573.64, | |
| "text": "four we now have a sequence of 1 2 3" | |
| }, | |
| { | |
| "start": 1578.159, | |
| "text": "four five tokens but our vocabulary" | |
| }, | |
| { | |
| "start": 1581.48, | |
| "text": "length now is seven and so in this way" | |
| }, | |
| { | |
| "start": 1585.159, | |
| "text": "we can iteratively compress our sequence" | |
| }, | |
| { | |
| "start": 1587.44, | |
| "text": "I we Mint new tokens so in the in the" | |
| }, | |
| { | |
| "start": 1590.279, | |
| "text": "exact same way we start we start out" | |
| }, | |
| { | |
| "start": 1592.399, | |
| "text": "with bite sequences so we have 256" | |
| }, | |
| { | |
| "start": 1596.24, | |
| "text": "vocabulary size but we're now going to" | |
| }, | |
| { | |
| "start": 1598.2, | |
| "text": "go through these and find the bite pairs" | |
| }, | |
| { | |
| "start": 1600.64, | |
| "text": "that occur the most and we're going to" | |
| }, | |
| { | |
| "start": 1602.559, | |
| "text": "iteratively start minting new tokens" | |
| }, | |
| { | |
| "start": 1604.84, | |
| "text": "appending them to our vocabulary and" | |
| }, | |
| { | |
| "start": 1606.76, | |
| "text": "replacing things and in this way we're" | |
| }, | |
| { | |
| "start": 1608.88, | |
| "text": "going to end up with a compressed" | |
| }, | |
| { | |
| "start": 1610.24, | |
| "text": "training data set and also an algorithm" | |
| }, | |
| { | |
| "start": 1612.96, | |
| "text": "for taking any arbitrary sequence and" | |
| }, | |
| { | |
| "start": 1615.279, | |
| "text": "encoding it using this uh vocabul" | |
| }, | |
| { | |
| "start": 1618.24, | |
| "text": "and also decoding it back to Strings so" | |
| }, | |
| { | |
| "start": 1621.0, | |
| "text": "let's now Implement all that so here's" | |
| }, | |
| { | |
| "start": 1623.24, | |
| "text": "what I did I went to this block post" | |
| }, | |
| { | |
| "start": 1625.679, | |
| "text": "that I enjoyed and I took the first" | |
| }, | |
| { | |
| "start": 1627.32, | |
| "text": "paragraph and I copy pasted it here into" | |
| }, | |
| { | |
| "start": 1630.0, | |
| "text": "text so this is one very long line" | |
| }, | |
| { | |
| "start": 1633.279, | |
| "text": "here now to get the tokens as I" | |
| }, | |
| { | |
| "start": 1635.96, | |
| "text": "mentioned we just take our text and we" | |
| }, | |
| { | |
| "start": 1637.36, | |
| "text": "encode it into utf8 the tokens here at" | |
| }, | |
| { | |
| "start": 1640.159, | |
| "text": "this point will be a raw bytes single" | |
| }, | |
| { | |
| "start": 1642.76, | |
| "text": "stream of bytes and just so that it's" | |
| }, | |
| { | |
| "start": 1645.6, | |
| "text": "easier to work with instead of just a" | |
| }, | |
| { | |
| "start": 1647.64, | |
| "text": "bytes object I'm going to convert all" | |
| }, | |
| { | |
| "start": 1649.96, | |
| "text": "those bytes to integers and then create" | |
| }, | |
| { | |
| "start": 1652.64, | |
| "text": "a list of it just so it's easier for us" | |
| }, | |
| { | |
| "start": 1654.279, | |
| "text": "to manipulate and work with in Python" | |
| }, | |
| { | |
| "start": 1655.88, | |
| "text": "and visualize and here I'm printing all" | |
| }, | |
| { | |
| "start": 1658.0, | |
| "text": "of that so this is the original um this" | |
| }, | |
| { | |
| "start": 1662.08, | |
| "text": "is the original paragraph and its length" | |
| }, | |
| { | |
| "start": 1665.0, | |
| "text": "is" | |
| }, | |
| { | |
| "start": 1665.799, | |
| "text": "533 uh code points and then here are the" | |
| }, | |
| { | |
| "start": 1669.799, | |
| "text": "bytes encoded in ut utf8 and we see that" | |
| }, | |
| { | |
| "start": 1673.32, | |
| "text": "this has a length of 616 bytes at this" | |
| }, | |
| { | |
| "start": 1676.32, | |
| "text": "point or 616 tokens and the reason this" | |
| }, | |
| { | |
| "start": 1679.039, | |
| "text": "is more is because a lot of these simple" | |
| }, | |
| { | |
| "start": 1681.84, | |
| "text": "ASCII characters or simple characters" | |
| }, | |
| { | |
| "start": 1684.6, | |
| "text": "they just become a single bite but a lot" | |
| }, | |
| { | |
| "start": 1686.44, | |
| "text": "of these Unicode more complex characters" | |
| }, | |
| { | |
| "start": 1688.76, | |
| "text": "become multiple bytes up to four and so" | |
| }, | |
| { | |
| "start": 1691.08, | |
| "text": "we are expanding that" | |
| }, | |
| { | |
| "start": 1692.76, | |
| "text": "size so now what we'd like to do as a" | |
| }, | |
| { | |
| "start": 1694.799, | |
| "text": "first step of the algorithm is we'd like" | |
| }, | |
| { | |
| "start": 1696.24, | |
| "text": "to iterate over here and find the pair" | |
| }, | |
| { | |
| "start": 1698.919, | |
| "text": "of bites that occur most frequently" | |
| }, | |
| { | |
| "start": 1702.0, | |
| "text": "because we're then going to merge it so" | |
| }, | |
| { | |
| "start": 1704.12, | |
| "text": "if you are working long on a notebook on" | |
| }, | |
| { | |
| "start": 1705.799, | |
| "text": "a side then I encourage you to basically" | |
| }, | |
| { | |
| "start": 1707.76, | |
| "text": "click on the link find this notebook and" | |
| }, | |
| { | |
| "start": 1709.919, | |
| "text": "try to write that function yourself" | |
| }, | |
| { | |
| "start": 1711.88, | |
| "text": "otherwise I'm going to come here and" | |
| }, | |
| { | |
| "start": 1712.96, | |
| "text": "Implement first the function that finds" | |
| }, | |
| { | |
| "start": 1714.96, | |
| "text": "the most common pair okay so here's what" | |
| }, | |
| { | |
| "start": 1716.919, | |
| "text": "I came up with there are many different" | |
| }, | |
| { | |
| "start": 1718.399, | |
| "text": "ways to implement this but I'm calling" | |
| }, | |
| { | |
| "start": 1720.32, | |
| "text": "the function get stats it expects a list" | |
| }, | |
| { | |
| "start": 1722.159, | |
| "text": "of integers I'm using a dictionary to" | |
| }, | |
| { | |
| "start": 1724.48, | |
| "text": "keep track of basically the counts and" | |
| }, | |
| { | |
| "start": 1726.88, | |
| "text": "then this is a pythonic way to iterate" | |
| }, | |
| { | |
| "start": 1728.84, | |
| "text": "consecutive elements of this list uh" | |
| }, | |
| { | |
| "start": 1731.44, | |
| "text": "which we covered in the previous video" | |
| }, | |
| { | |
| "start": 1733.72, | |
| "text": "and then here I'm just keeping track of" | |
| }, | |
| { | |
| "start": 1735.919, | |
| "text": "just incrementing by one um for all the" | |
| }, | |
| { | |
| "start": 1738.559, | |
| "text": "pairs so if I call this on all the" | |
| }, | |
| { | |
| "start": 1740.399, | |
| "text": "tokens here then the stats comes out" | |
| }, | |
| { | |
| "start": 1743.399, | |
| "text": "here so this is the dictionary the keys" | |
| }, | |
| { | |
| "start": 1746.159, | |
| "text": "are these tuples of consecutive" | |
| }, | |
| { | |
| "start": 1748.919, | |
| "text": "elements and this is the count so just" | |
| }, | |
| { | |
| "start": 1751.6, | |
| "text": "to uh print it in a slightly better way" | |
| }, | |
| { | |
| "start": 1754.679, | |
| "text": "this is one way that I like to do that" | |
| }, | |
| { | |
| "start": 1757.6, | |
| "text": "where you it's a little bit compound" | |
| }, | |
| { | |
| "start": 1760.559, | |
| "text": "here so you can pause if you like but we" | |
| }, | |
| { | |
| "start": 1762.36, | |
| "text": "iterate all all the items the items" | |
| }, | |
| { | |
| "start": 1765.039, | |
| "text": "called on dictionary returns pairs of" | |
| }, | |
| { | |
| "start": 1767.399, | |
| "text": "key value and instead I create a list" | |
| }, | |
| { | |
| "start": 1771.799, | |
| "text": "here of value key because if it's a" | |
| }, | |
| { | |
| "start": 1775.12, | |
| "text": "value key list then I can call sort on" | |
| }, | |
| { | |
| "start": 1777.279, | |
| "text": "it and by default python will uh use the" | |
| }, | |
| { | |
| "start": 1781.36, | |
| "text": "first element which in this case will be" | |
| }, | |
| { | |
| "start": 1783.559, | |
| "text": "value to sort by if it's given tuples and" | |
| }, | |
| { | |
| "start": 1786.64, | |
| "text": "then reverse so it's descending and" | |
| }, | |
| { | |
| "start": 1788.72, | |
| "text": "print that so basically it looks like" | |
| }, | |
| { | |
| "start": 1790.88, | |
| "text": "101 comma 32 was the most commonly" | |
| }, | |
| { | |
| "start": 1793.96, | |
| "text": "occurring consecutive pair and it" | |
| }, | |
| { | |
| "start": 1795.72, | |
| "text": "occurred 20 times we can double check" | |
| }, | |
| { | |
| "start": 1798.2, | |
| "text": "that that makes reasonable sense so if I" | |
| }, | |
| { | |
| "start": 1800.44, | |
| "text": "just search" | |
| }, | |
| { | |
| "start": 1802.08, | |
| "text": "10132 then you see that these are the 20" | |
| }, | |
| { | |
| "start": 1805.2, | |
| "text": "occurrences of that um pair and if we'd" | |
| }, | |
| { | |
| "start": 1810.12, | |
| "text": "like to take a look at what exactly that" | |
| }, | |
| { | |
| "start": 1811.519, | |
| "text": "pair is we can use Char which is the" | |
| }, | |
| { | |
| "start": 1814.279, | |
| "text": "opposite of ord in Python so we give it a" | |
| }, | |
| { | |
| "start": 1817.84, | |
| "text": "um unic code Cod point so 101 and of 32" | |
| }, | |
| { | |
| "start": 1822.039, | |
| "text": "and we see that this is e and space so" | |
| }, | |
| { | |
| "start": 1825.0, | |
| "text": "basically there's a lot of E space here" | |
| }, | |
| { | |
| "start": 1828.08, | |
| "text": "meaning that a lot of these words seem" | |
| }, | |
| { | |
| "start": 1829.48, | |
| "text": "to end with e so here's eace as an" | |
| }, | |
| { | |
| "start": 1832.12, | |
| "text": "example so there's a lot of that going" | |
| }, | |
| { | |
| "start": 1834.039, | |
| "text": "on here and this is the most common pair" | |
| }, | |
| { | |
| "start": 1836.72, | |
| "text": "so now that we've identified the most" | |
| }, | |
| { | |
| "start": 1838.24, | |
| "text": "common pair we would like to iterate" | |
| }, | |
| { | |
| "start": 1840.36, | |
| "text": "over this sequence we're going to Mint a" | |
| }, | |
| { | |
| "start": 1842.679, | |
| "text": "new token with the ID of" | |
| }, | |
| { | |
| "start": 1844.799, | |
| "text": "256 right because these tokens currently" | |
| }, | |
| { | |
| "start": 1847.84, | |
| "text": "go from Z to 255 so when we create a new" | |
| }, | |
| { | |
| "start": 1850.64, | |
| "text": "token it will have an ID of" | |
| }, | |
| { | |
| "start": 1852.84, | |
| "text": "256 and we're going to iterate over this" | |
| }, | |
| { | |
| "start": 1856.0, | |
| "text": "entire um list and every every time we" | |
| }, | |
| { | |
| "start": 1859.84, | |
| "text": "see 101 comma 32 we're going to swap" | |
| }, | |
| { | |
| "start": 1862.72, | |
| "text": "that out for" | |
| }, | |
| { | |
| "start": 1863.919, | |
| "text": "256 so let's Implement that now and feel" | |
| }, | |
| { | |
| "start": 1867.24, | |
| "text": "free to uh do that yourself as well so" | |
| }, | |
| { | |
| "start": 1869.96, | |
| "text": "first I commented uh this just so we" | |
| }, | |
| { | |
| "start": 1871.96, | |
| "text": "don't pollute uh the notebook too much" | |
| }, | |
| { | |
| "start": 1874.96, | |
| "text": "this is a nice way of in Python" | |
| }, | |
| { | |
| "start": 1877.96, | |
| "text": "obtaining the highest ranking pair so" | |
| }, | |
| { | |
| "start": 1880.399, | |
| "text": "we're basically calling the Max on this" | |
| }, | |
| { | |
| "start": 1883.08, | |
| "text": "dictionary stats and this will return" | |
| }, | |
| { | |
| "start": 1886.32, | |
| "text": "the maximum" | |
| }, | |
| { | |
| "start": 1887.679, | |
| "text": "key and then the question is how does it" | |
| }, | |
| { | |
| "start": 1890.159, | |
| "text": "rank keys so you can provide it with a" | |
| }, | |
| { | |
| "start": 1892.84, | |
| "text": "function that ranks keys and that" | |
| }, | |
| { | |
| "start": 1895.2, | |
| "text": "function is just stats.get uh stats." | |
| }, | |
| { | |
| "start": 1898.2, | |
| "text": "get would basically return the value" | |
| }, | |
| { | |
| "start": 1901.12, | |
| "text": "and so we're ranking by the value and" | |
| }, | |
| { | |
| "start": 1902.799, | |
| "text": "getting the maximum key so it's 101" | |
| }, | |
| { | |
| "start": 1905.48, | |
| "text": "comma 32 as we saw now to actually merge" | |
| }, | |
| { | |
| "start": 1909.2, | |
| "text": "10132 um this is the function that I" | |
| }, | |
| { | |
| "start": 1911.88, | |
| "text": "wrote but again there are many different" | |
| }, | |
| { | |
| "start": 1913.279, | |
| "text": "versions of it so we're going to take a" | |
| }, | |
| { | |
| "start": 1915.72, | |
| "text": "list of IDs and the the pair that we" | |
| }, | |
| { | |
| "start": 1917.72, | |
| "text": "want to replace and that pair will be" | |
| }, | |
| { | |
| "start": 1919.76, | |
| "text": "replaced with the new index" | |
| }, | |
| { | |
| "start": 1922.24, | |
| "text": "idx so iterating through IDs if we find" | |
| }, | |
| { | |
| "start": 1925.559, | |
| "text": "the pair swap it out for idx so we" | |
| }, | |
| { | |
| "start": 1928.44, | |
| "text": "create this new list and then we start" | |
| }, | |
| { | |
| "start": 1930.519, | |
| "text": "at zero and then we go through this" | |
| }, | |
| { | |
| "start": 1932.76, | |
| "text": "entire list sequentially from left to" | |
| }, | |
| { | |
| "start": 1934.84, | |
| "text": "right and here we are checking for" | |
| }, | |
| { | |
| "start": 1937.12, | |
| "text": "equality at the current position with" | |
| }, | |
| { | |
| "start": 1939.639, | |
| "text": "the" | |
| }, | |
| { | |
| "start": 1940.88, | |
| "text": "pair um so here we are checking that the" | |
| }, | |
| { | |
| "start": 1943.399, | |
| "text": "pair matches now here is a bit of a" | |
| }, | |
| { | |
| "start": 1945.48, | |
| "text": "tricky condition that you have to append" | |
| }, | |
| { | |
| "start": 1947.24, | |
| "text": "if you're trying to be careful and that" | |
| }, | |
| { | |
| "start": 1949.08, | |
| "text": "is that um you don't want this here to" | |
| }, | |
| { | |
| "start": 1951.679, | |
| "text": "be out of Bounds at the very last" | |
| }, | |
| { | |
| "start": 1953.76, | |
| "text": "position when you're on the rightmost" | |
| }, | |
| { | |
| "start": 1955.399, | |
| "text": "element of this list otherwise this" | |
| }, | |
| { | |
| "start": 1957.12, | |
| "text": "would uh give you an out of bounds error" | |
| }, | |
| { | |
| "start": 1959.279, | |
| "text": "so we have to make sure that we're not" | |
| }, | |
| { | |
| "start": 1960.679, | |
| "text": "at the very very last element so uh this" | |
| }, | |
| { | |
| "start": 1964.039, | |
| "text": "would be false for that so if we find a" | |
| }, | |
| { | |
| "start": 1966.6, | |
| "text": "match we append to this new list that" | |
| }, | |
| { | |
| "start": 1971.08, | |
| "text": "replacement index and we increment the" | |
| }, | |
| { | |
| "start": 1973.32, | |
| "text": "position by two so we skip over that" | |
| }, | |
| { | |
| "start": 1974.799, | |
| "text": "entire pair but otherwise if we we" | |
| }, | |
| { | |
| "start": 1977.12, | |
| "text": "haven't found a matching pair we just" | |
| }, | |
| { | |
| "start": 1979.08, | |
| "text": "sort of copy over the um element at that" | |
| }, | |
| { | |
| "start": 1982.12, | |
| "text": "position and increment by one then" | |
| }, | |
| { | |
| "start": 1985.24, | |
| "text": "return this so here's a very small toy" | |
| }, | |
| { | |
| "start": 1987.36, | |
| "text": "example if we have a list 566 791 and we" | |
| }, | |
| { | |
| "start": 1990.36, | |
| "text": "want to replace the occurrences of 67" | |
| }, | |
| { | |
| "start": 1992.36, | |
| "text": "with 99 then calling this on that will" | |
| }, | |
| { | |
| "start": 1996.36, | |
| "text": "give us what we're asking for so here" | |
| }, | |
| { | |
| "start": 1998.919, | |
| "text": "the 67 is replaced with" | |
| }, | |
| { | |
| "start": 2001.519, | |
| "text": "99 so now I'm going to uncomment this" | |
| }, | |
| { | |
| "start": 2003.76, | |
| "text": "for our actual use case where we want to" | |
| }, | |
| { | |
| "start": 2007.279, | |
| "text": "take our tokens we want to take the top" | |
| }, | |
| { | |
| "start": 2009.519, | |
| "text": "pair here and replace it with 256 to get" | |
| }, | |
| { | |
| "start": 2013.12, | |
| "text": "tokens to if we run this we get the" | |
| }, | |
| { | |
| "start": 2017.24, | |
| "text": "following so recall that previously we" | |
| }, | |
| { | |
| "start": 2020.88, | |
| "text": "had a length 616 in this list and now we" | |
| }, | |
| { | |
| "start": 2025.12, | |
| "text": "have a length 596 right so this" | |
| }, | |
| { | |
| "start": 2028.44, | |
| "text": "decreased by 20 which makes sense" | |
| }, | |
| { | |
| "start": 2030.159, | |
| "text": "because there are 20 occurrences" | |
| }, | |
| { | |
| "start": 2032.36, | |
| "text": "moreover we can try to find 256 here and" | |
| }, | |
| { | |
| "start": 2035.48, | |
| "text": "we see plenty of occurrences on off it" | |
| }, | |
| { | |
| "start": 2038.44, | |
| "text": "and moreover just double check there" | |
| }, | |
| { | |
| "start": 2039.76, | |
| "text": "should be no occurrence of 10132 so this" | |
| }, | |
| { | |
| "start": 2042.519, | |
| "text": "is the original array plenty of them and" | |
| }, | |
| { | |
| "start": 2045.0, | |
| "text": "in the second array there are no" | |
| }, | |
| { | |
| "start": 2046.159, | |
| "text": "occurrences of 10132 so we've" | |
| }, | |
| { | |
| "start": 2048.52, | |
| "text": "successfully merged this single pair and" | |
| }, | |
| { | |
| "start": 2051.599, | |
| "text": "now we just uh iterate this so we are" | |
| }, | |
| { | |
| "start": 2053.919, | |
| "text": "going to go over the sequence again find" | |
| }, | |
| { | |
| "start": 2055.48, | |
| "text": "the most common pair and replace it so" | |
| }, | |
| { | |
| "start": 2057.8, | |
| "text": "let me now write a while loop that uses" | |
| }, | |
| { | |
| "start": 2059.48, | |
| "text": "these functions to do this um sort of" | |
| }, | |
| { | |
| "start": 2061.8, | |
| "text": "iteratively and how many times do we do" | |
| }, | |
| { | |
| "start": 2064.28, | |
| "text": "it four well that's totally up to us as" | |
| }, | |
| { | |
| "start": 2066.28, | |
| "text": "a hyper parameter" | |
| }, | |
| { | |
| "start": 2067.399, | |
| "text": "the more um steps we take the larger" | |
| }, | |
| { | |
| "start": 2070.919, | |
| "text": "will be our vocabulary and the shorter" | |
| }, | |
| { | |
| "start": 2073.04, | |
| "text": "will be our sequence and there is some" | |
| }, | |
| { | |
| "start": 2075.119, | |
| "text": "sweet spot that we usually find works" | |
| }, | |
| { | |
| "start": 2077.24, | |
| "text": "the best in practice and so this is kind" | |
| }, | |
| { | |
| "start": 2079.919, | |
| "text": "of a hyperparameter and we tune it and" | |
| }, | |
| { | |
| "start": 2081.639, | |
| "text": "we find good vocabulary sizes as an" | |
| }, | |
| { | |
| "start": 2084.2, | |
| "text": "example gp4 currently uses roughly" | |
| }, | |
| { | |
| "start": 2086.0, | |
| "text": "100,000 tokens and um ballpark that those" | |
| }, | |
| { | |
| "start": 2089.879, | |
| "text": "are reasonable numbers currently instead" | |
| }, | |
| { | |
| "start": 2091.8, | |
| "text": "the are large language models so let me" | |
| }, | |
| { | |
| "start": 2093.919, | |
| "text": "now write uh putting putting it all" | |
| }, | |
| { | |
| "start": 2095.96, | |
| "text": "together and uh iterating these steps" | |
| }, | |
| { | |
| "start": 2098.68, | |
| "text": "okay now before we dive into the while loop" | |
| }, | |
| { | |
| "start": 2100.52, | |
| "text": "I wanted to add one more cell here where" | |
| }, | |
| { | |
| "start": 2103.28, | |
| "text": "I went to the blog post and instead of" | |
| }, | |
| { | |
| "start": 2104.96, | |
| "text": "grabbing just the first paragraph or two" | |
| }, | |
| { | |
| "start": 2107.0, | |
| "text": "I took the entire blog post and I" | |
| }, | |
| { | |
| "start": 2108.8, | |
| "text": "stretched it out in a single line and" | |
| }, | |
| { | |
| "start": 2110.96, | |
| "text": "basically just using longer text will" | |
| }, | |
| { | |
| "start": 2112.48, | |
| "text": "allow us to have more representative" | |
| }, | |
| { | |
| "start": 2113.88, | |
| "text": "statistics for the byte pairs and we'll" | |
| }, | |
| { | |
| "start": 2116.28, | |
| "text": "just get a more sensible results out of" | |
| }, | |
| { | |
| "start": 2118.04, | |
| "text": "it because it's longer text um so here" | |
| }, | |
| { | |
| "start": 2121.76, | |
| "text": "we have the raw text we encode it into" | |
| }, | |
| { | |
| "start": 2124.359, | |
| "text": "bytes using the utf8 encoding" | |
| }, | |
| { | |
| "start": 2127.64, | |
| "text": "and then here as before we are just" | |
| }, | |
| { | |
| "start": 2130.079, | |
| "text": "changing it into a list of integers in" | |
| }, | |
| { | |
| "start": 2131.839, | |
| "text": "Python just so it's easier to work with" | |
| }, | |
| { | |
| "start": 2133.96, | |
| "text": "instead of the raw bytes objects and then" | |
| }, | |
| { | |
| "start": 2136.68, | |
| "text": "this is the code that I came up with uh" | |
| }, | |
| { | |
| "start": 2140.76, | |
| "text": "to actually do the merging in Loop these" | |
| }, | |
| { | |
| "start": 2144.0, | |
| "text": "two functions here are identical to what" | |
| }, | |
| { | |
| "start": 2145.839, | |
| "text": "we had above I only included them here" | |
| }, | |
| { | |
| "start": 2148.119, | |
| "text": "just so that you have the point of" | |
| }, | |
| { | |
| "start": 2149.88, | |
| "text": "reference here so uh these two are" | |
| }, | |
| { | |
| "start": 2153.359, | |
| "text": "identical and then this is the new code" | |
| }, | |
| { | |
| "start": 2155.0, | |
| "text": "that I added so the first first thing we" | |
| }, | |
| { | |
| "start": 2157.079, | |
| "text": "want to do is we want to decide on the" | |
| }, | |
| { | |
| "start": 2158.56, | |
| "text": "final vocabulary size that we want our" | |
| }, | |
| { | |
| "start": 2161.04, | |
| "text": "tokenizer to have and as I mentioned" | |
| }, | |
| { | |
| "start": 2162.96, | |
| "text": "this is a hyper parameter and you set it" | |
| }, | |
| { | |
| "start": 2164.52, | |
| "text": "in some way depending on your best" | |
| }, | |
| { | |
| "start": 2166.44, | |
| "text": "performance so let's say for us we're" | |
| }, | |
| { | |
| "start": 2168.48, | |
| "text": "going to use 276 because that way we're" | |
| }, | |
| { | |
| "start": 2170.839, | |
| "text": "going to be doing exactly 20" | |
| }, | |
| { | |
| "start": 2173.079, | |
| "text": "merges and uh 20 merges because we" | |
| }, | |
| { | |
| "start": 2175.72, | |
| "text": "already have" | |
| }, | |
| { | |
| "start": 2176.88, | |
| "text": "256 tokens for the raw bytes and to" | |
| }, | |
| { | |
| "start": 2180.88, | |
| "text": "reach 276 we have to do 20 merges uh to" | |
| }, | |
| { | |
| "start": 2183.68, | |
| "text": "add 20 new" | |
| }, | |
| { | |
| "start": 2185.48, | |
| "text": "tokens here uh this is uh one way in" | |
| }, | |
| { | |
| "start": 2188.2, | |
| "text": "Python to just create a copy of a list" | |
| }, | |
| { | |
| "start": 2191.48, | |
| "text": "so I'm taking the tokens list and by" | |
| }, | |
| { | |
| "start": 2193.52, | |
| "text": "wrapping it in a list python will" | |
| }, | |
| { | |
| "start": 2195.839, | |
| "text": "construct a new list of all the" | |
| }, | |
| { | |
| "start": 2197.16, | |
| "text": "individual elements so this is just a" | |
| }, | |
| { | |
| "start": 2198.64, | |
| "text": "copy" | |
| }, | |
| { | |
| "start": 2199.92, | |
| "text": "operation then here I'm creating a" | |
| }, | |
| { | |
| "start": 2202.079, | |
| "text": "merges uh dictionary so this merges" | |
| }, | |
| { | |
| "start": 2204.839, | |
| "text": "dictionary is going to maintain" | |
| }, | |
| { | |
| "start": 2206.119, | |
| "text": "basically the child one child two" | |
| }, | |
| { | |
| "start": 2209.4, | |
| "text": "mapping to a new uh token and so what" | |
| }, | |
| { | |
| "start": 2212.52, | |
| "text": "we're going to be building up here is a" | |
| }, | |
| { | |
| "start": 2213.92, | |
| "text": "binary tree of merges but actually it's" | |
| }, | |
| { | |
| "start": 2216.92, | |
| "text": "not exactly a tree because a tree would" | |
| }, | |
| { | |
| "start": 2219.28, | |
| "text": "have a single root node with a bunch of" | |
| }, | |
| { | |
| "start": 2221.44, | |
| "text": "leaves for us we're starting with the" | |
| }, | |
| { | |
| "start": 2223.44, | |
| "text": "leaves on the bottom which are the" | |
| }, | |
| { | |
| "start": 2225.0, | |
| "text": "individual bytes those are the starting" | |
| }, | |
| { | |
| "start": 2226.92, | |
| "text": "256 tokens and then we're starting to" | |
| }, | |
| { | |
| "start": 2229.52, | |
| "text": "like merge two of them at a time and so" | |
| }, | |
| { | |
| "start": 2231.52, | |
| "text": "it's not a tree it's more like a forest" | |
| }, | |
| { | |
| "start": 2234.96, | |
| "text": "um uh as we merge these elements" | |
| }, | |
| { | |
| "start": 2238.92, | |
| "text": "so for 20 merges we're going to find the" | |
| }, | |
| { | |
| "start": 2242.88, | |
| "text": "most commonly occurring pair we're going" | |
| }, | |
| { | |
| "start": 2245.079, | |
| "text": "to Mint a new token integer for it so I" | |
| }, | |
| { | |
| "start": 2248.48, | |
| "text": "here will start at zero so we'll going" | |
| }, | |
| { | |
| "start": 2250.079, | |
| "text": "to start at 256 we're going to print" | |
| }, | |
| { | |
| "start": 2252.359, | |
| "text": "that we're merging it and we're going to" | |
| }, | |
| { | |
| "start": 2254.44, | |
| "text": "replace all of the occurrences of that" | |
| }, | |
| { | |
| "start": 2256.2, | |
| "text": "pair with the newly minted token and" | |
| }, | |
| { | |
| "start": 2259.56, | |
| "text": "we're going to record that this pair of" | |
| }, | |
| { | |
| "start": 2262.16, | |
| "text": "integers merged into this new" | |
| }, | |
| { | |
| "start": 2265.52, | |
| "text": "integer so running this gives us the" | |
| }, | |
| { | |
| "start": 2269.079, | |
| "text": "following" | |
| }, | |
| { | |
| "start": 2271.16, | |
| "text": "output so we did 20 merges and for" | |
| }, | |
| { | |
| "start": 2274.48, | |
| "text": "example the first merge was exactly as" | |
| }, | |
| { | |
| "start": 2276.839, | |
| "text": "before the" | |
| }, | |
| { | |
| "start": 2278.839, | |
| "text": "10132 um tokens merging into a new token" | |
| }, | |
| { | |
| "start": 2281.8, | |
| "text": "256 now keep in mind that the" | |
| }, | |
| { | |
| "start": 2284.0, | |
| "text": "individual uh tokens 101 and 32 can" | |
| }, | |
| { | |
| "start": 2286.599, | |
| "text": "still occur in the sequence after" | |
| }, | |
| { | |
| "start": 2288.44, | |
| "text": "merging it's only when they occur" | |
| }, | |
| { | |
| "start": 2290.359, | |
| "text": "exactly consecutively that that becomes" | |
| }, | |
| { | |
| "start": 2292.599, | |
| "text": "256" | |
| }, | |
| { | |
| "start": 2293.88, | |
| "text": "now um and in particular the other thing" | |
| }, | |
| { | |
| "start": 2296.92, | |
| "text": "to notice here is that the token 256" | |
| }, | |
| { | |
| "start": 2299.16, | |
| "text": "which is the newly minted token is also" | |
| }, | |
| { | |
| "start": 2301.4, | |
| "text": "eligible for merging so here on the" | |
| }, | |
| { | |
| "start": 2303.4, | |
| "text": "bottom the 20th merge was a merge of 25" | |
| }, | |
| { | |
| "start": 2306.839, | |
| "text": "and 259 becoming" | |
| }, | |
| { | |
| "start": 2308.88, | |
| "text": "275 so every time we replace these" | |
| }, | |
| { | |
| "start": 2311.8, | |
| "text": "tokens they become eligible for merging" | |
| }, | |
| { | |
| "start": 2313.64, | |
| "text": "in the next round of the iteration so" | |
| }, | |
| { | |
| "start": 2315.92, | |
| "text": "that's why we're building up a small" | |
| }, | |
| { | |
| "start": 2317.119, | |
| "text": "sort of binary Forest instead of a" | |
| }, | |
| { | |
| "start": 2318.8, | |
| "text": "single individual" | |
| }, | |
| { | |
| "start": 2320.2, | |
| "text": "tree one thing we can take a look at as" | |
| }, | |
| { | |
| "start": 2322.319, | |
| "text": "well is we can take a look at the" | |
| }, | |
| { | |
| "start": 2324.0, | |
| "text": "compression ratio that we've achieved so" | |
| }, | |
| { | |
| "start": 2326.16, | |
| "text": "in particular we started off with this" | |
| }, | |
| { | |
| "start": 2328.359, | |
| "text": "tokens list um so we started off with" | |
| }, | |
| { | |
| "start": 2331.4, | |
| "text": "24,000 bytes and after merging 20 times" | |
| }, | |
| { | |
| "start": 2336.28, | |
| "text": "uh we now have only" | |
| }, | |
| { | |
| "start": 2338.52, | |
| "text": "19,000 um tokens and so therefore the" | |
| }, | |
| { | |
| "start": 2341.92, | |
| "text": "compression ratio simply just dividing" | |
| }, | |
| { | |
| "start": 2343.64, | |
| "text": "the two is roughly 1.27 so that's the" | |
| }, | |
| { | |
| "start": 2346.8, | |
| "text": "amount of compression we were able to" | |
| }, | |
| { | |
| "start": 2347.96, | |
| "text": "achieve of this text with only 20" | |
| }, | |
| { | |
| "start": 2350.8, | |
| "text": "merges um and of course the more" | |
| }, | |
| { | |
| "start": 2353.119, | |
| "text": "vocabulary elements you add uh the" | |
| }, | |
| { | |
| "start": 2355.599, | |
| "text": "greater the compression ratio here would" | |
| }, | |
| { | |
| "start": 2359.24, | |
| "text": "be finally so that's kind of like um the" | |
| }, | |
| { | |
| "start": 2363.76, | |
| "text": "training of the tokenizer if you will" | |
| }, | |
| { | |
| "start": 2365.72, | |
| "text": "now 1 Point I wanted to make is that and" | |
| }, | |
| { | |
| "start": 2368.28, | |
| "text": "maybe this is a diagram that can help um" | |
| }, | |
| { | |
| "start": 2371.28, | |
| "text": "kind of illustrate is that tokenizer is" | |
| }, | |
| { | |
| "start": 2373.079, | |
| "text": "a completely separate object from the" | |
| }, | |
| { | |
| "start": 2374.92, | |
| "text": "large language model itself so" | |
| }, | |
| { | |
| "start": 2377.0, | |
| "text": "everything in this lecture we're not" | |
| }, | |
| { | |
| "start": 2378.04, | |
| "text": "really touching the llm itself uh we're" | |
| }, | |
| { | |
| "start": 2380.119, | |
| "text": "just training the tokenizer this is a" | |
| }, | |
| { | |
| "start": 2381.839, | |
| "text": "completely separate pre-processing stage" | |
| }, | |
| { | |
| "start": 2383.92, | |
| "text": "usually so the tokenizer will have its" | |
| }, | |
| { | |
| "start": 2386.24, | |
| "text": "own training set just like a large" | |
| }, | |
| { | |
| "start": 2387.96, | |
| "text": "language model has a potentially" | |
| }, | |
| { | |
| "start": 2389.8, | |
| "text": "different training set so the tokenizer" | |
| }, | |
| { | |
| "start": 2392.04, | |
| "text": "has a training set of documents on which" | |
| }, | |
| { | |
| "start": 2393.4, | |
| "text": "you're going to train the" | |
| }, | |
| { | |
| "start": 2394.76, | |
| "text": "tokenizer and then and um we're" | |
| }, | |
| { | |
| "start": 2397.76, | |
| "text": "performing the byte pair encoding" | |
| }, | |
| { | |
| "start": 2398.96, | |
| "text": "algorithm as we saw above to train the" | |
| }, | |
| { | |
| "start": 2401.079, | |
| "text": "vocabulary of this" | |
| }, | |
| { | |
| "start": 2402.64, | |
| "text": "tokenizer so it has its own training set" | |
| }, | |
| { | |
| "start": 2404.96, | |
| "text": "it is a pre-processing stage that you" | |
| }, | |
| { | |
| "start": 2406.52, | |
| "text": "would run a single time in the beginning" | |
| }, | |
| { | |
| "start": 2409.24, | |
| "text": "um and the tokenizer is trained using" | |
| }, | |
| { | |
| "start": 2411.96, | |
| "text": "byte pair encoding algorithm once you have the" | |
| }, | |
| { | |
| "start": 2414.359, | |
| "text": "tokenizer once it's trained and you have" | |
| }, | |
| { | |
| "start": 2416.319, | |
| "text": "the vocabulary and you have the merges" | |
| }, | |
| { | |
| "start": 2419.04, | |
| "text": "uh we can do both encoding and decoding" | |
| }, | |
| { | |
| "start": 2422.28, | |
| "text": "so these two arrows here so the" | |
| }, | |
| { | |
| "start": 2424.52, | |
| "text": "tokenizer is a translation layer between" | |
| }, | |
| { | |
| "start": 2427.0, | |
| "text": "raw text which is as we saw the sequence" | |
| }, | |
| { | |
| "start": 2430.04, | |
| "text": "of Unicode code points it can take raw" | |
| }, | |
| { | |
| "start": 2432.52, | |
| "text": "text and turn it into a token sequence" | |
| }, | |
| { | |
| "start": 2435.44, | |
| "text": "and vice versa it can take a token" | |
| }, | |
| { | |
| "start": 2437.0, | |
| "text": "sequence and translate it back into raw" | |
| }, | |
| { | |
| "start": 2440.76, | |
| "text": "text so now that we have trained uh" | |
| }, | |
| { | |
| "start": 2443.359, | |
| "text": "tokenizer and we have these merges we" | |
| }, | |
| { | |
| "start": 2445.96, | |
| "text": "are going to turn to how we can do the" | |
| }, | |
| { | |
| "start": 2447.44, | |
| "text": "encoding and the decoding step if you" | |
| }, | |
| { | |
| "start": 2449.48, | |
| "text": "give me text here are the tokens and" | |
| }, | |
| { | |
| "start": 2451.24, | |
| "text": "vice versa if you give me tokens here's" | |
| }, | |
| { | |
| "start": 2453.0, | |
| "text": "the text once we have that we can" | |
| }, | |
| { | |
| "start": 2455.28, | |
| "text": "translate between these two Realms and" | |
| }, | |
| { | |
| "start": 2457.52, | |
| "text": "then the language model is going to be" | |
| }, | |
| { | |
| "start": 2458.76, | |
| "text": "trained as a step two afterwards and" | |
| }, | |
| { | |
| "start": 2461.64, | |
| "text": "typically in a in a sort of a" | |
| }, | |
| { | |
| "start": 2463.64, | |
| "text": "state-of-the-art application you might" | |
| }, | |
| { | |
| "start": 2465.48, | |
| "text": "take all of your training data for the" | |
| }, | |
| { | |
| "start": 2466.839, | |
| "text": "language model and you might run it" | |
| }, | |
| { | |
| "start": 2468.359, | |
| "text": "through the tokenizer and sort of" | |
| }, | |
| { | |
| "start": 2470.4, | |
| "text": "translate everything into a massive" | |
| }, | |
| { | |
| "start": 2471.92, | |
| "text": "token sequence and then you can throw" | |
| }, | |
| { | |
| "start": 2473.64, | |
| "text": "away the raw text you're just left with" | |
| }, | |
| { | |
| "start": 2475.44, | |
| "text": "the tokens themselves and those are" | |
| }, | |
| { | |
| "start": 2477.72, | |
| "text": "stored on disk and that is what the" | |
| }, | |
| { | |
| "start": 2479.72, | |
| "text": "large language model is actually reading" | |
| }, | |
| { | |
| "start": 2481.319, | |
| "text": "when it's training on them so this one" | |
| }, | |
| { | |
| "start": 2483.24, | |
| "text": "approach that you can take as a single" | |
| }, | |
| { | |
| "start": 2484.8, | |
| "text": "massive pre-processing step a" | |
| }, | |
| { | |
| "start": 2486.88, | |
| "text": "stage um so yeah basically I think the" | |
| }, | |
| { | |
| "start": 2490.4, | |
| "text": "most important thing I want to get" | |
| }, | |
| { | |
| "start": 2491.4, | |
| "text": "across is that this is completely" | |
| }, | |
| { | |
| "start": 2492.599, | |
| "text": "separate stage it usually has its own" | |
| }, | |
| { | |
| "start": 2494.4, | |
| "text": "entire uh training set you may want to" | |
| }, | |
| { | |
| "start": 2496.839, | |
| "text": "have those training sets be different" | |
| }, | |
| { | |
| "start": 2498.359, | |
| "text": "between the tokenizer and the large" | |
| }, | |
| { | |
| "start": 2499.599, | |
| "text": "language model so for example when" | |
| }, | |
| { | |
| "start": 2501.28, | |
| "text": "you're training the tokenizer as I" | |
| }, | |
| { | |
| "start": 2503.319, | |
| "text": "mentioned we don't just care about the" | |
| }, | |
| { | |
| "start": 2505.079, | |
| "text": "performance of English text we care" | |
| }, | |
| { | |
| "start": 2506.76, | |
| "text": "about uh multi many different languages" | |
| }, | |
| { | |
| "start": 2509.44, | |
| "text": "and we also care about code or not code" | |
| }, | |
| { | |
| "start": 2511.52, | |
| "text": "so you may want to look into different" | |
| }, | |
| { | |
| "start": 2513.24, | |
| "text": "kinds of mixtures of different kinds of" | |
| }, | |
| { | |
| "start": 2515.2, | |
| "text": "languages and different amounts of code" | |
| }, | |
| { | |
| "start": 2517.359, | |
| "text": "and things like that because the amount" | |
| }, | |
| { | |
| "start": 2520.24, | |
| "text": "of different language that you have in" | |
| }, | |
| { | |
| "start": 2521.96, | |
| "text": "your tokenizer training set will" | |
| }, | |
| { | |
| "start": 2523.76, | |
| "text": "determine how many merges of it there" | |
| }, | |
| { | |
| "start": 2526.119, | |
| "text": "will be and therefore that determines" | |
| }, | |
| { | |
| "start": 2528.24, | |
| "text": "the density with which uh this type of" | |
| }, | |
| { | |
| "start": 2531.319, | |
| "text": "data is um sort of has in the token" | |
| }, | |
| { | |
| "start": 2535.2, | |
| "text": "space and so roughly speaking" | |
| }, | |
| { | |
| "start": 2537.76, | |
| "text": "intuitively if you add some amount of" | |
| }, | |
| { | |
| "start": 2539.72, | |
| "text": "data like say you have a ton of Japanese" | |
| }, | |
| { | |
| "start": 2541.359, | |
| "text": "data in your uh tokenizer training set" | |
| }, | |
| { | |
| "start": 2544.04, | |
| "text": "then that means that more Japanese" | |
| }, | |
| { | |
| "start": 2545.359, | |
| "text": "tokens will get merged" | |
| }, | |
| { | |
| "start": 2546.839, | |
| "text": "and therefore Japanese will have shorter" | |
| }, | |
| { | |
| "start": 2548.92, | |
| "text": "sequences uh and that's going to be" | |
| }, | |
| { | |
| "start": 2550.64, | |
| "text": "beneficial for the large language model" | |
| }, | |
| { | |
| "start": 2552.4, | |
| "text": "which has a finite context length on" | |
| }, | |
| { | |
| "start": 2554.359, | |
| "text": "which it can work on in in the token" | |
| }, | |
| { | |
| "start": 2556.599, | |
| "text": "space uh so hopefully that makes sense" | |
| }, | |
| { | |
| "start": 2559.24, | |
| "text": "so we're now going to turn to encoding" | |
| }, | |
| { | |
| "start": 2561.2, | |
| "text": "and decoding now that we have trained a" | |
| }, | |
| { | |
| "start": 2563.079, | |
| "text": "tokenizer so we have our merges and now" | |
| }, | |
| { | |
| "start": 2566.4, | |
| "text": "how do we do encoding and decoding okay" | |
| }, | |
| { | |
| "start": 2568.44, | |
| "text": "so let's begin with decoding which is" | |
| }, | |
| { | |
| "start": 2570.44, | |
| "text": "this Arrow over here so given a token" | |
| }, | |
| { | |
| "start": 2572.72, | |
| "text": "sequence let's go through the tokenizer" | |
| }, | |
| { | |
| "start": 2574.92, | |
| "text": "to get back a python string object so" | |
| }, | |
| { | |
| "start": 2577.52, | |
| "text": "the raw text so this is the function" | |
| }, | |
| { | |
| "start": 2579.88, | |
| "text": "that we'd like to implement um we're" | |
| }, | |
| { | |
| "start": 2581.88, | |
| "text": "given the list of integers and we want" | |
| }, | |
| { | |
| "start": 2583.44, | |
| "text": "to return a python string if you'd like" | |
| }, | |
| { | |
| "start": 2585.68, | |
| "text": "uh try to implement this function" | |
| }, | |
| { | |
| "start": 2586.839, | |
| "text": "yourself it's a fun exercise otherwise" | |
| }, | |
| { | |
| "start": 2588.839, | |
| "text": "I'm going to start uh pasting in my own" | |
| }, | |
| { | |
| "start": 2591.28, | |
| "text": "solution so there are many different" | |
| }, | |
| { | |
| "start": 2593.52, | |
| "text": "ways to do it um here's one way I will" | |
| }, | |
| { | |
| "start": 2596.88, | |
| "text": "create an uh kind of pre-processing" | |
| }, | |
| { | |
| "start": 2598.88, | |
| "text": "variable that I will call" | |
| }, | |
| { | |
| "start": 2601.04, | |
| "text": "vocab and vocab is a mapping or a" | |
| }, | |
| { | |
| "start": 2604.68, | |
| "text": "dictionary in Python for from the token" | |
| }, | |
| { | |
| "start": 2607.559, | |
| "text": "uh ID to the bytes object for that token" | |
| }, | |
| { | |
| "start": 2611.52, | |
| "text": "so we begin with the raw bytes for" | |
| }, | |
| { | |
| "start": 2613.8, | |
| "text": "tokens from 0 to 255 and then we go in" | |
| }, | |
| { | |
| "start": 2616.839, | |
| "text": "order of all the merges and we sort of" | |
| }, | |
| { | |
| "start": 2619.76, | |
| "text": "uh populate this vocab list by doing an" | |
| }, | |
| { | |
| "start": 2622.28, | |
| "text": "addition here so this is the basically" | |
| }, | |
| { | |
| "start": 2625.72, | |
| "text": "the bytes representation of the first" | |
| }, | |
| { | |
| "start": 2627.76, | |
| "text": "child followed by the second one and" | |
| }, | |
| { | |
| "start": 2630.04, | |
| "text": "remember these are bytes objects so this" | |
| }, | |
| { | |
| "start": 2632.079, | |
| "text": "addition here is an addition of two" | |
| }, | |
| { | |
| "start": 2634.2, | |
| "text": "bytes objects just concatenation" | |
| }, | |
| { | |
| "start": 2637.04, | |
| "text": "so that's what we get" | |
| }, | |
| { | |
| "start": 2638.76, | |
| "text": "here one tricky thing to be careful with" | |
| }, | |
| { | |
| "start": 2641.2, | |
| "text": "by the way is that I'm iterating a" | |
| }, | |
| { | |
| "start": 2642.88, | |
| "text": "dictionary in Python using a DOT items" | |
| }, | |
| { | |
| "start": 2646.0, | |
| "text": "and uh it really matters that this runs" | |
| }, | |
| { | |
| "start": 2648.72, | |
| "text": "in the order in which we inserted items" | |
| }, | |
| { | |
| "start": 2651.48, | |
| "text": "into the merges dictionary luckily" | |
| }, | |
| { | |
| "start": 2653.559, | |
| "text": "starting with python 3.7 this is" | |
| }, | |
| { | |
| "start": 2655.4, | |
| "text": "guaranteed to be the case but before" | |
| }, | |
| { | |
| "start": 2657.04, | |
| "text": "python 3.7 this iteration may have been" | |
| }, | |
| { | |
| "start": 2659.16, | |
| "text": "out of order with respect to how we" | |
| }, | |
| { | |
| "start": 2660.96, | |
| "text": "inserted elements into merges and this" | |
| }, | |
| { | |
| "start": 2663.16, | |
| "text": "may not have worked but we are using an" | |
| }, | |
| { | |
| "start": 2665.8, | |
| "text": "um modern python so we're okay and then" | |
| }, | |
| { | |
| "start": 2668.8, | |
| "text": "here uh given the IDS the first thing" | |
| }, | |
| { | |
| "start": 2671.599, | |
| "text": "we're going to do is get the" | |
| }, | |
| { | |
| "start": 2675.04, | |
| "text": "tokens so the way I implemented this" | |
| }, | |
| { | |
| "start": 2677.24, | |
| "text": "here is I'm taking I'm iterating over" | |
| }, | |
| { | |
| "start": 2679.599, | |
| "text": "all the IDS I'm using vocab to look up" | |
| }, | |
| { | |
| "start": 2681.88, | |
| "text": "their bytes and then here this is one" | |
| }, | |
| { | |
| "start": 2684.119, | |
| "text": "way in Python to concatenate all these" | |
| }, | |
| { | |
| "start": 2686.64, | |
| "text": "bytes together to create our tokens and" | |
| }, | |
| { | |
| "start": 2689.72, | |
| "text": "then these tokens here at this point are" | |
| }, | |
| { | |
| "start": 2691.72, | |
| "text": "raw bytes so I have to decode using UTF" | |
| }, | |
| { | |
| "start": 2696.0, | |
| "text": "8 now back into python strings so" | |
| }, | |
| { | |
| "start": 2699.2, | |
| "text": "previously we called that encode on a" | |
| }, | |
| { | |
| "start": 2701.16, | |
| "text": "string object to get the bytes and now" | |
| }, | |
| { | |
| "start": 2703.2, | |
| "text": "we're doing it Opposite we're taking the" | |
| }, | |
| { | |
| "start": 2705.2, | |
| "text": "bytes and calling a decode on the bytes" | |
| }, | |
| { | |
| "start": 2707.8, | |
| "text": "object to get a string in Python and" | |
| }, | |
| { | |
| "start": 2711.0, | |
| "text": "then we can return" | |
| }, | |
| { | |
| "start": 2713.319, | |
| "text": "text so um this is how we can do it now" | |
| }, | |
| { | |
| "start": 2716.96, | |
| "text": "this actually has a um issue um in the" | |
| }, | |
| { | |
| "start": 2720.8, | |
| "text": "way I implemented it and this could" | |
| }, | |
| { | |
| "start": 2722.119, | |
| "text": "actually throw an error so try to think" | |
| }, | |
| { | |
| "start": 2724.119, | |
| "text": "figure out why this code could actually" | |
| }, | |
| { | |
| "start": 2726.48, | |
| "text": "result in an error if we plug in um uh" | |
| }, | |
| { | |
| "start": 2730.24, | |
| "text": "some sequence of IDs that is" | |
| }, | |
| { | |
| "start": 2732.599, | |
| "text": "unlucky so let me demonstrate the issue" | |
| }, | |
| { | |
| "start": 2735.24, | |
| "text": "when I try to decode just something like" | |
| }, | |
| { | |
| "start": 2737.16, | |
| "text": "97 I am going to get letter A here back" | |
| }, | |
| { | |
| "start": 2741.079, | |
| "text": "so nothing too crazy happening but when" | |
| }, | |
| { | |
| "start": 2744.4, | |
| "text": "I try to decode 128 as a single element" | |
| }, | |
| { | |
| "start": 2748.24, | |
| "text": "the token 128 is what in string or in" | |
| }, | |
| { | |
| "start": 2751.319, | |
| "text": "Python object UnicodeDecodeError utf-8 can't" | |
| }, | |
| { | |
| "start": 2755.119, | |
| "text": "decode byte um 0x80 which is this in HEX in" | |
| }, | |
| { | |
| "start": 2760.119, | |
| "text": "position zero invalid start bite what" | |
| }, | |
| { | |
| "start": 2761.92, | |
| "text": "does that mean well to understand what" | |
| }, | |
| { | |
| "start": 2763.64, | |
| "text": "this means we have to go back to our" | |
| }, | |
| { | |
| "start": 2764.76, | |
| "text": "utf8 page uh that I briefly showed" | |
| }, | |
| { | |
| "start": 2767.92, | |
| "text": "earlier and this is Wikipedia utf8 and" | |
| }, | |
| { | |
| "start": 2770.76, | |
| "text": "basically there's a specific schema that" | |
| }, | |
| { | |
| "start": 2773.559, | |
| "text": "utf8 bytes take so in particular if you" | |
| }, | |
| { | |
| "start": 2776.92, | |
| "text": "have a multi-byte object for some of the" | |
| }, | |
| { | |
| "start": 2779.839, | |
| "text": "Unicode characters they have to have" | |
| }, | |
| { | |
| "start": 2781.52, | |
| "text": "this special sort of envelope in how the" | |
| }, | |
| { | |
| "start": 2784.16, | |
| "text": "encoding works and so what's happening" | |
| }, | |
| { | |
| "start": 2786.52, | |
| "text": "here is that invalid start byte that's" | |
| }, | |
| { | |
| "start": 2790.0, | |
| "text": "because" | |
| }, | |
| { | |
| "start": 2791.0, | |
| "text": "128 the binary representation of it is" | |
| }, | |
| { | |
| "start": 2793.88, | |
| "text": "one followed by all zeros so we have one" | |
| }, | |
| { | |
| "start": 2797.359, | |
| "text": "and then all zero and we see here that" | |
| }, | |
| { | |
| "start": 2799.559, | |
| "text": "that doesn't conform to the format" | |
| }, | |
| { | |
| "start": 2801.04, | |
| "text": "because one followed by all zero just" | |
| }, | |
| { | |
| "start": 2802.68, | |
| "text": "doesn't fit any of these rules so to" | |
| }, | |
| { | |
| "start": 2804.96, | |
| "text": "speak so it's an invalid start bite" | |
| }, | |
| { | |
| "start": 2807.64, | |
| "text": "which is byte one this one must have a" | |
| }, | |
| { | |
| "start": 2810.599, | |
| "text": "one following it and then a zero" | |
| }, | |
| { | |
| "start": 2812.76, | |
| "text": "following it and then the content of" | |
| }, | |
| { | |
| "start": 2814.48, | |
| "text": "your Unicode in hex here so basically we" | |
| }, | |
| { | |
| "start": 2817.68, | |
| "text": "don't um exactly follow the utf8" | |
| }, | |
| { | |
| "start": 2819.96, | |
| "text": "standard and this cannot be decoded and" | |
| }, | |
| { | |
| "start": 2822.52, | |
| "text": "so the way to fix this um is to" | |
| }, | |
| { | |
| "start": 2826.28, | |
| "text": "use this errors equals in bytes. decode" | |
| }, | |
| { | |
| "start": 2831.04, | |
| "text": "function of python and by default errors" | |
| }, | |
| { | |
| "start": 2833.839, | |
| "text": "is strict so we will throw an error if" | |
| }, | |
| { | |
| "start": 2837.16, | |
| "text": "um it's not valid utf8 bytes encoding" | |
| }, | |
| { | |
| "start": 2840.28, | |
| "text": "but there are many different things that" | |
| }, | |
| { | |
| "start": 2841.68, | |
| "text": "you could put here on error handling" | |
| }, | |
| { | |
| "start": 2843.68, | |
| "text": "this is the full list of all the errors" | |
| }, | |
| { | |
| "start": 2845.359, | |
| "text": "that you can use and in particular" | |
| }, | |
| { | |
| "start": 2847.359, | |
| "text": "instead of strict let's change it to" | |
| }, | |
| { | |
| "start": 2849.359, | |
| "text": "replace and that will replace uh with" | |
| }, | |
| { | |
| "start": 2852.28, | |
| "text": "this special marker this replacement" | |
| }, | |
| { | |
| "start": 2855.8, | |
| "text": "character so errors equals replace and" | |
| }, | |
| { | |
| "start": 2860.52, | |
| "text": "now we just get that character" | |
| }, | |
| { | |
| "start": 2863.16, | |
| "text": "back so basically not every single byte" | |
| }, | |
| { | |
| "start": 2866.96, | |
| "text": "sequence is valid" | |
| }, | |
| { | |
| "start": 2868.52, | |
| "text": "utf8 and if it happens that your large" | |
| }, | |
| { | |
| "start": 2871.48, | |
| "text": "language model for example predicts your" | |
| }, | |
| { | |
| "start": 2873.88, | |
| "text": "tokens in a bad manner then they might" | |
| }, | |
| { | |
| "start": 2876.64, | |
| "text": "not fall into valid utf8 and then we" | |
| }, | |
| { | |
| "start": 2880.24, | |
| "text": "won't be able to decode them so the" | |
| }, | |
| { | |
| "start": 2882.88, | |
| "text": "standard practice is to basically uh use" | |
| }, | |
| { | |
| "start": 2885.64, | |
| "text": "errors equals replace and this is what" | |
| }, | |
| { | |
| "start": 2887.52, | |
| "text": "you will also find in the openai um code" | |
| }, | |
| { | |
| "start": 2890.319, | |
| "text": "that they released as well but basically" | |
| }, | |
| { | |
| "start": 2892.72, | |
| "text": "whenever you see um this kind of a" | |
| }, | |
| { | |
| "start": 2894.2, | |
| "text": "character in your output in that case uh" | |
| }, | |
| { | |
| "start": 2896.0, | |
| "text": "something went wrong and the LM output" | |
| }, | |
| { | |
| "start": 2898.16, | |
| "text": "not was not valid uh sort of sequence of" | |
| }, | |
| { | |
| "start": 2901.52, | |
| "text": "tokens okay and now we're going to go" | |
| }, | |
| { | |
| "start": 2903.48, | |
| "text": "the other way so we are going to" | |
| }, | |
| { | |
| "start": 2905.319, | |
| "text": "implement" | |
| }, | |
| { | |
| "start": 2906.24, | |
| "text": "this Arrow right here where we are going" | |
| }, | |
| { | |
| "start": 2907.96, | |
| "text": "to be given a string and we want to" | |
| }, | |
| { | |
| "start": 2909.64, | |
| "text": "encode it into" | |
| }, | |
| { | |
| "start": 2911.16, | |
| "text": "tokens so this is the signature of the" | |
| }, | |
| { | |
| "start": 2913.72, | |
| "text": "function that we're interested in and um" | |
| }, | |
| { | |
| "start": 2916.92, | |
| "text": "this should basically print a list of" | |
| }, | |
| { | |
| "start": 2918.16, | |
| "text": "integers of the tokens so again uh try" | |
| }, | |
| { | |
| "start": 2921.76, | |
| "text": "to maybe implement this yourself if" | |
| }, | |
| { | |
| "start": 2923.04, | |
| "text": "you'd like a fun exercise uh and pause" | |
| }, | |
| { | |
| "start": 2925.559, | |
| "text": "here otherwise I'm going to start" | |
| }, | |
| { | |
| "start": 2926.52, | |
| "text": "putting in my" | |
| }, | |
| { | |
| "start": 2927.96, | |
| "text": "solution so again there are many ways to" | |
| }, | |
| { | |
| "start": 2930.28, | |
| "text": "do this so um this is one of the ways" | |
| }, | |
| { | |
| "start": 2933.64, | |
| "text": "that sort of I came came up with so the" | |
| }, | |
| { | |
| "start": 2937.599, | |
| "text": "first thing we're going to do is we are" | |
| }, | |
| { | |
| "start": 2939.16, | |
| "text": "going" | |
| }, | |
| { | |
| "start": 2940.119, | |
| "text": "to uh take our text encode it into utf8" | |
| }, | |
| { | |
| "start": 2943.44, | |
| "text": "to get the raw bytes and then as before" | |
| }, | |
| { | |
| "start": 2945.799, | |
| "text": "we're going to call list on the bytes" | |
| }, | |
| { | |
| "start": 2947.28, | |
| "text": "object to get a list of integers of" | |
| }, | |
| { | |
| "start": 2950.079, | |
| "text": "those bytes so those are the starting" | |
| }, | |
| { | |
| "start": 2952.76, | |
| "text": "tokens those are the raw bytes of our" | |
| }, | |
| { | |
| "start": 2954.599, | |
| "text": "sequence but now of course according to" | |
| }, | |
| { | |
| "start": 2956.96, | |
| "text": "the merges dictionary above and recall" | |
| }, | |
| { | |
| "start": 2959.559, | |
| "text": "this was the" | |
| }, | |
| { | |
| "start": 2961.079, | |
| "text": "merges some of the bytes may be merged" | |
| }, | |
| { | |
| "start": 2963.96, | |
| "text": "according to this lookup in addition to" | |
| }, | |
| { | |
| "start": 2966.559, | |
| "text": "that remember that the merges was built" | |
| }, | |
| { | |
| "start": 2968.16, | |
| "text": "from top to bottom and this is sort of" | |
| }, | |
| { | |
| "start": 2969.92, | |
| "text": "the order in which we inserted stuff" | |
| }, | |
| { | |
| "start": 2971.359, | |
| "text": "into merges and so we prefer to do all" | |
| }, | |
| { | |
| "start": 2974.28, | |
| "text": "these merges in the beginning before we" | |
| }, | |
| { | |
| "start": 2976.119, | |
| "text": "do these merges later because um for" | |
| }, | |
| { | |
| "start": 2979.2, | |
| "text": "example this merge over here relies on" | |
| }, | |
| { | |
| "start": 2980.96, | |
| "text": "the 256 which got merged here so we have" | |
| }, | |
| { | |
| "start": 2984.64, | |
| "text": "to go in the order from top to bottom" | |
| }, | |
| { | |
| "start": 2986.92, | |
| "text": "sort of if we are going to be merging" | |
| }, | |
| { | |
| "start": 2988.92, | |
| "text": "anything now we expect to be doing a few" | |
| }, | |
| { | |
| "start": 2991.44, | |
| "text": "merges so we're going to be doing while" | |
| }, | |
| { | |
| "start": 2994.52, | |
| "text": "true um and now we want to find a pair" | |
| }, | |
| { | |
| "start": 2998.079, | |
| "text": "of bytes that is consecutive that we are" | |
| }, | |
| { | |
| "start": 3000.72, | |
| "text": "allowed to merge according to this in" | |
| }, | |
| { | |
| "start": 3003.599, | |
| "text": "order to reuse some of the functionality" | |
| }, | |
| { | |
| "start": 3005.0, | |
| "text": "that we've already written I'm going to" | |
| }, | |
| { | |
| "start": 3006.559, | |
| "text": "reuse the function uh get" | |
| }, | |
| { | |
| "start": 3009.079, | |
| "text": "stats so recall that get stats uh will" | |
| }, | |
| { | |
| "start": 3012.079, | |
| "text": "give us the we'll basically count up how" | |
| }, | |
| { | |
| "start": 3014.24, | |
| "text": "many times every single pair occurs in" | |
| }, | |
| { | |
| "start": 3016.599, | |
| "text": "our sequence of tokens and return that" | |
| }, | |
| { | |
| "start": 3018.92, | |
| "text": "as a dictionary and the dictionary was a" | |
| }, | |
| { | |
| "start": 3022.079, | |
| "text": "mapping from all the different uh byte" | |
| }, | |
| { | |
| "start": 3025.599, | |
| "text": "pairs to the number of times that they" | |
| }, | |
| { | |
| "start": 3027.4, | |
| "text": "occur right um at this point we don't" | |
| }, | |
| { | |
| "start": 3030.28, | |
| "text": "actually care how many times they occur" | |
| }, | |
| { | |
| "start": 3032.359, | |
| "text": "in the sequence we only care what the" | |
| }, | |
| { | |
| "start": 3034.359, | |
| "text": "raw pairs are in that sequence and so" | |
| }, | |
| { | |
| "start": 3036.839, | |
| "text": "I'm only going to be using basically the" | |
| }, | |
| { | |
| "start": 3038.28, | |
| "text": "keys of the dictionary I only care about" | |
| }, | |
| { | |
| "start": 3040.44, | |
| "text": "the set of possible merge candidates if" | |
| }, | |
| { | |
| "start": 3042.92, | |
| "text": "that makes" | |
| }, | |
| { | |
| "start": 3043.76, | |
| "text": "sense now we want to identify the pair" | |
| }, | |
| { | |
| "start": 3046.16, | |
| "text": "that we're going to be merging at this" | |
| }, | |
| { | |
| "start": 3047.72, | |
| "text": "stage of the loop so what do we want we" | |
| }, | |
| { | |
| "start": 3050.24, | |
| "text": "want to find the pair or like the a key" | |
| }, | |
| { | |
| "start": 3053.24, | |
| "text": "inside stats that has the lowest index" | |
| }, | |
| { | |
| "start": 3057.079, | |
| "text": "in the merges uh dictionary because we" | |
| }, | |
| { | |
| "start": 3059.64, | |
| "text": "want to do all the early merges before" | |
| }, | |
| { | |
| "start": 3061.28, | |
| "text": "we work our way to the late" | |
| }, | |
| { | |
| "start": 3063.079, | |
| "text": "merges so again there are many different" | |
| }, | |
| { | |
| "start": 3065.319, | |
| "text": "ways to implement this but I'm going to" | |
| }, | |
| { | |
| "start": 3067.72, | |
| "text": "do something a little bit fancy" | |
| }, | |
| { | |
| "start": 3071.28, | |
| "text": "here so I'm going to be using the Min" | |
| }, | |
| { | |
| "start": 3074.2, | |
| "text": "over an iterator in Python when you call" | |
| }, | |
| { | |
| "start": 3076.799, | |
| "text": "Min on an iterator and stats here as a" | |
| }, | |
| { | |
| "start": 3078.96, | |
| "text": "dictionary we're going to be iterating" | |
| }, | |
| { | |
| "start": 3080.839, | |
| "text": "the keys of this dictionary in Python so" | |
| }, | |
| { | |
| "start": 3084.119, | |
| "text": "we're looking at all the pairs inside" | |
| }, | |
| { | |
| "start": 3087.079, | |
| "text": "stats um which are all the consecutive" | |
| }, | |
| { | |
| "start": 3089.359, | |
| "text": "Pairs and we're going to be taking the" | |
| }, | |
| { | |
| "start": 3092.079, | |
| "text": "consecutive pair inside tokens that has" | |
| }, | |
| { | |
| "start": 3094.44, | |
| "text": "the minimum what the Min takes a key" | |
| }, | |
| { | |
| "start": 3098.88, | |
| "text": "which gives us the function that is" | |
| }, | |
| { | |
| "start": 3100.319, | |
| "text": "going to return a value over which we're" | |
| }, | |
| { | |
| "start": 3102.359, | |
| "text": "going to do the Min and the one we care" | |
| }, | |
| { | |
| "start": 3104.96, | |
| "text": "about is we're we care about taking" | |
| }, | |
| { | |
| "start": 3106.44, | |
| "text": "merges and basically getting um that" | |
| }, | |
| { | |
| "start": 3110.92, | |
| "text": "pairs" | |
| }, | |
| { | |
| "start": 3112.839, | |
| "text": "index so basically for any pair inside" | |
| }, | |
| { | |
| "start": 3117.16, | |
| "text": "stats we are going to be looking into" | |
| }, | |
| { | |
| "start": 3119.72, | |
| "text": "merges at what index it has and we want" | |
| }, | |
| { | |
| "start": 3123.079, | |
| "text": "to get the pair with the Min number so" | |
| }, | |
| { | |
| "start": 3125.839, | |
| "text": "as an example if there's a pair 101 and" | |
| }, | |
| { | |
| "start": 3127.559, | |
| "text": "32 we definitely want to get that pair" | |
| }, | |
| { | |
| "start": 3130.44, | |
| "text": "uh we want to identify it here and" | |
| }, | |
| { | |
| "start": 3131.92, | |
| "text": "return it and pair would become 10132 if" | |
| }, | |
| { | |
| "start": 3135.04, | |
| "text": "it" | |
| }, | |
| { | |
| "start": 3135.76, | |
| "text": "occurs and the reason that I'm putting a" | |
| }, | |
| { | |
| "start": 3137.96, | |
| "text": "float INF here as a fall back is that in" | |
| }, | |
| { | |
| "start": 3141.4, | |
| "text": "the get function when we call uh when we" | |
| }, | |
| { | |
| "start": 3144.2, | |
| "text": "basically consider a pair that doesn't" | |
| }, | |
| { | |
| "start": 3146.599, | |
| "text": "occur in the merges then that pair is" | |
| }, | |
| { | |
| "start": 3149.0, | |
| "text": "not eligible to be merged right so if in" | |
| }, | |
| { | |
| "start": 3151.88, | |
| "text": "the token sequence there's some pair" | |
| }, | |
| { | |
| "start": 3153.48, | |
| "text": "that is not a merging pair it cannot be" | |
| }, | |
| { | |
| "start": 3155.559, | |
| "text": "merged then uh it doesn't actually occur" | |
| }, | |
| { | |
| "start": 3158.119, | |
| "text": "here and it doesn't have an index and uh" | |
| }, | |
| { | |
| "start": 3160.839, | |
| "text": "it cannot be merged which we will denote" | |
| }, | |
| { | |
| "start": 3162.599, | |
| "text": "as float INF and the reason Infinity is" | |
| }, | |
| { | |
| "start": 3165.079, | |
| "text": "nice here is because for sure we're" | |
| }, | |
| { | |
| "start": 3166.599, | |
| "text": "guaranteed that it's not going to" | |
| }, | |
| { | |
| "start": 3168.079, | |
| "text": "participate in the list of candidates" | |
| }, | |
| { | |
| "start": 3170.04, | |
| "text": "when we do the min so uh so this is one" | |
| }, | |
| { | |
| "start": 3173.44, | |
| "text": "way to do it so B basically long story" | |
| }, | |
| { | |
| "start": 3175.88, | |
| "text": "short this Returns the most eligible" | |
| }, | |
| { | |
| "start": 3178.28, | |
| "text": "merging candidate pair uh that occurs in" | |
| }, | |
| { | |
| "start": 3181.119, | |
| "text": "the tokens now one thing to be careful" | |
| }, | |
| { | |
| "start": 3184.079, | |
| "text": "with here is this uh function here might" | |
| }, | |
| { | |
| "start": 3187.48, | |
| "text": "fail in the following way if there's" | |
| }, | |
| { | |
| "start": 3189.88, | |
| "text": "nothing to merge then uh uh then there's" | |
| }, | |
| { | |
| "start": 3193.599, | |
| "text": "nothing in merges um that satisfi that" | |
| }, | |
| { | |
| "start": 3196.92, | |
| "text": "is satisfied anymore there's nothing to" | |
| }, | |
| { | |
| "start": 3198.559, | |
| "text": "merge everything just returns float infs" | |
| }, | |
| { | |
| "start": 3201.72, | |
| "text": "and then the pair I think will just" | |
| }, | |
| { | |
| "start": 3203.68, | |
| "text": "become the very first element of stats" | |
| }, | |
| { | |
| "start": 3206.96, | |
| "text": "um but this pair is not actually a" | |
| }, | |
| { | |
| "start": 3208.359, | |
| "text": "mergeable pair it just becomes the first" | |
| }, | |
| { | |
| "start": 3211.16, | |
| "text": "pair inside stats arbitrarily because" | |
| }, | |
| { | |
| "start": 3213.28, | |
| "text": "all of these pairs evaluate to float in" | |
| }, | |
| { | |
| "start": 3216.319, | |
| "text": "for the merging Criterion so basically" | |
| }, | |
| { | |
| "start": 3218.559, | |
| "text": "it could be that this this doesn't look" | |
| }, | |
| { | |
| "start": 3220.359, | |
| "text": "succeed because there's no more merging" | |
| }, | |
| { | |
| "start": 3221.64, | |
| "text": "pairs so if this pair is not in merges" | |
| }, | |
| { | |
| "start": 3224.64, | |
| "text": "that was returned then this is a signal" | |
| }, | |
| { | |
| "start": 3226.839, | |
| "text": "for us that actually there was nothing" | |
| }, | |
| { | |
| "start": 3228.4, | |
| "text": "to merge no single pair can be merged" | |
| }, | |
| { | |
| "start": 3230.72, | |
| "text": "anymore in that case we will break" | |
| }, | |
| { | |
| "start": 3233.079, | |
| "text": "out um nothing else can be" | |
| }, | |
| { | |
| "start": 3237.88, | |
| "text": "merged you may come up with a different" | |
| }, | |
| { | |
| "start": 3239.839, | |
| "text": "implementation by the way this is kind" | |
| }, | |
| { | |
| "start": 3241.04, | |
| "text": "of like really trying hard in" | |
| }, | |
| { | |
| "start": 3243.88, | |
| "text": "Python um but really we're just trying" | |
| }, | |
| { | |
| "start": 3245.96, | |
| "text": "to find a pair that can be merged with" | |
| }, | |
| { | |
| "start": 3247.799, | |
| "text": "the lowest index" | |
| }, | |
| { | |
| "start": 3249.599, | |
| "text": "here now if we did find a pair that is" | |
| }, | |
| { | |
| "start": 3253.88, | |
| "text": "inside merges with the lowest index then" | |
| }, | |
| { | |
| "start": 3256.28, | |
| "text": "we can merge it" | |
| }, | |
| { | |
| "start": 3259.839, | |
| "text": "so we're going to look into the merger" | |
| }, | |
| { | |
| "start": 3262.04, | |
| "text": "dictionary for that pair to look up the" | |
| }, | |
| { | |
| "start": 3264.28, | |
| "text": "index and we're going to now merge that" | |
| }, | |
| { | |
| "start": 3267.28, | |
| "text": "into that index so we're going to do" | |
| }, | |
| { | |
| "start": 3269.24, | |
| "text": "tokens equals and we're going to" | |
| }, | |
| { | |
| "start": 3272.24, | |
| "text": "replace the original tokens we're going" | |
| }, | |
| { | |
| "start": 3274.64, | |
| "text": "to be replacing the pair pair and we're" | |
| }, | |
| { | |
| "start": 3276.76, | |
| "text": "going to be replacing it with index idx" | |
| }, | |
| { | |
| "start": 3278.96, | |
| "text": "and this returns a new list of tokens" | |
| }, | |
| { | |
| "start": 3281.64, | |
| "text": "where every occurrence of pair is" | |
| }, | |
| { | |
| "start": 3283.16, | |
| "text": "replaced with idx so we're doing a merge" | |
| }, | |
| { | |
| "start": 3286.28, | |
| "text": "and we're going to be continuing this" | |
| }, | |
| { | |
| "start": 3287.599, | |
| "text": "until eventually nothing can be merged" | |
| }, | |
| { | |
| "start": 3289.28, | |
| "text": "we'll come out here and we'll break out" | |
| }, | |
| { | |
| "start": 3291.28, | |
| "text": "and here we just return" | |
| }, | |
| { | |
| "start": 3293.319, | |
| "text": "tokens and so that that's the" | |
| }, | |
| { | |
| "start": 3295.839, | |
| "text": "implementation I think so hopefully this" | |
| }, | |
| { | |
| "start": 3297.44, | |
| "text": "runs okay cool um yeah and this looks uh" | |
| }, | |
| { | |
| "start": 3302.44, | |
| "text": "reasonable so for example 32 is a space" | |
| }, | |
| { | |
| "start": 3304.88, | |
| "text": "in ASCII so that's here um so this looks" | |
| }, | |
| { | |
| "start": 3309.2, | |
| "text": "like it worked great okay so let's wrap" | |
| }, | |
| { | |
| "start": 3311.48, | |
| "text": "up this section of the video at least I" | |
| }, | |
| { | |
| "start": 3313.48, | |
| "text": "wanted to point out that this is not" | |
| }, | |
| { | |
| "start": 3314.88, | |
| "text": "quite the right implementation just yet" | |
| }, | |
| { | |
| "start": 3316.359, | |
| "text": "because we are leaving out a special" | |
| }, | |
| { | |
| "start": 3317.96, | |
| "text": "case so in particular if uh we try to do" | |
| }, | |
| { | |
| "start": 3320.68, | |
| "text": "this this would give us an error and the" | |
| }, | |
| { | |
| "start": 3323.559, | |
| "text": "issue is that um if we only have a" | |
| }, | |
| { | |
| "start": 3325.64, | |
| "text": "single character or an empty string then" | |
| }, | |
| { | |
| "start": 3328.039, | |
| "text": "stats is empty and that causes an issue" | |
| }, | |
| { | |
| "start": 3329.839, | |
| "text": "inside Min so one way to fight this is" | |
| }, | |
| { | |
| "start": 3332.96, | |
| "text": "if L of tokens is at least two because" | |
| }, | |
| { | |
| "start": 3336.359, | |
| "text": "if it's less than two it's just a single" | |
| }, | |
| { | |
| "start": 3337.839, | |
| "text": "token or no tokens then let's just uh" | |
| }, | |
| { | |
| "start": 3340.079, | |
| "text": "there's nothing to merge so we just" | |
| }, | |
| { | |
| "start": 3341.52, | |
| "text": "return so that would fix uh that" | |
| }, | |
| { | |
| "start": 3344.64, | |
| "text": "case Okay and then second I have a few" | |
| }, | |
| { | |
| "start": 3348.079, | |
| "text": "test cases here for us as well so first" | |
| }, | |
| { | |
| "start": 3350.44, | |
| "text": "let's make sure uh about or let's note" | |
| }, | |
| { | |
| "start": 3353.359, | |
| "text": "the following if we take a string and we" | |
| }, | |
| { | |
| "start": 3356.44, | |
| "text": "try to encode it and then decode it back" | |
| }, | |
| { | |
| "start": 3358.64, | |
| "text": "you'd expect to get the same string back" | |
| }, | |
| { | |
| "start": 3360.24, | |
| "text": "right is that true for all" | |
| }, | |
| { | |
| "start": 3364.68, | |
| "text": "strings so I think uh so here it is the" | |
| }, | |
| { | |
| "start": 3367.16, | |
| "text": "case and I think in general this is" | |
| }, | |
| { | |
| "start": 3368.72, | |
| "text": "probably the case um but notice that" | |
| }, | |
| { | |
| "start": 3372.039, | |
| "text": "going backwards is not is not you're not" | |
| }, | |
| { | |
| "start": 3374.64, | |
| "text": "going to have an identity going" | |
| }, | |
| { | |
| "start": 3375.92, | |
| "text": "backwards because as I mentioned us not" | |
| }, | |
| { | |
| "start": 3379.2, | |
| "text": "all token sequences are valid utf8 uh" | |
| }, | |
| { | |
| "start": 3382.96, | |
| "text": "sort of byte streams and so so therefore" | |
| }, | |
| { | |
| "start": 3385.44, | |
| "text": "you're some of them can't even be" | |
| }, | |
| { | |
| "start": 3387.2, | |
| "text": "decodable um so this only goes in One" | |
| }, | |
| { | |
| "start": 3390.48, | |
| "text": "Direction but for that one direction we" | |
| }, | |
| { | |
| "start": 3392.92, | |
| "text": "can check uh here if we take the" | |
| }, | |
| { | |
| "start": 3394.76, | |
| "text": "training text which is the text that we" | |
| }, | |
| { | |
| "start": 3396.319, | |
| "text": "train to tokenizer around we can make" | |
| }, | |
| { | |
| "start": 3398.0, | |
| "text": "sure that when we encode and decode we" | |
| }, | |
| { | |
| "start": 3399.44, | |
| "text": "get the same thing back which is true" | |
| }, | |
| { | |
| "start": 3401.96, | |
| "text": "and here I took some validation data so" | |
| }, | |
| { | |
| "start": 3403.839, | |
| "text": "I went to I think this web page and I" | |
| }, | |
| { | |
| "start": 3405.599, | |
| "text": "grabbed some text so this is text that" | |
| }, | |
| { | |
| "start": 3407.76, | |
| "text": "the tokenizer has not seen and we can" | |
| }, | |
| { | |
| "start": 3409.68, | |
| "text": "make sure that this also works um okay" | |
| }, | |
| { | |
| "start": 3412.72, | |
| "text": "so that gives us some confidence that" | |
| }, | |
| { | |
| "start": 3413.92, | |
| "text": "this was correctly implemented" | |
| }, | |
| { | |
| "start": 3416.0, | |
| "text": "so those are the basics of the byte pair" | |
| }, | |
| { | |
| "start": 3418.039, | |
| "text": "encoding algorithm we saw how we can uh" | |
| }, | |
| { | |
| "start": 3420.72, | |
| "text": "take some training set train a tokenizer" | |
| }, | |
| { | |
| "start": 3423.68, | |
| "text": "the parameters of this tokenizer really" | |
| }, | |
| { | |
| "start": 3425.44, | |
| "text": "are just this dictionary of merges and" | |
| }, | |
| { | |
| "start": 3428.119, | |
| "text": "that basically creates the little binary" | |
| }, | |
| { | |
| "start": 3429.599, | |
| "text": "Forest on top of raw" | |
| }, | |
| { | |
| "start": 3431.559, | |
| "text": "bites once we have this the merges table" | |
| }, | |
| { | |
| "start": 3434.68, | |
| "text": "we can both encode and decode between" | |
| }, | |
| { | |
| "start": 3436.799, | |
| "text": "raw text and token sequences so that's" | |
| }, | |
| { | |
| "start": 3439.4, | |
| "text": "the the simplest setting of The" | |
| }, | |
| { | |
| "start": 3441.28, | |
| "text": "tokenizer what we're going to do now" | |
| }, | |
| { | |
| "start": 3443.2, | |
| "text": "though is we're going to look at some of" | |
| }, | |
| { | |
| "start": 3444.48, | |
| "text": "the state of the art large language models and" | |
| }, | |
| { | |
| "start": 3446.559, | |
| "text": "the kinds of tokenizers that they use" | |
| }, | |
| { | |
| "start": 3448.359, | |
| "text": "and we're going to see that this picture" | |
| }, | |
| { | |
| "start": 3449.559, | |
| "text": "complexifies very quickly so we're going" | |
| }, | |
| { | |
| "start": 3451.64, | |
| "text": "to go through the details of this comp" | |
| }, | |
| { | |
| "start": 3454.599, | |
| "text": "complexification one at a time so let's" | |
| }, | |
| { | |
| "start": 3457.52, | |
| "text": "kick things off by looking at the GPT" | |
| }, | |
| { | |
| "start": 3459.039, | |
| "text": "Series so in particular I have the gpt2" | |
| }, | |
| { | |
| "start": 3461.64, | |
| "text": "paper here um and this paper is from" | |
| }, | |
| { | |
| "start": 3464.64, | |
| "text": "2019 or so so 5 years ago and let's" | |
| }, | |
| { | |
| "start": 3468.359, | |
| "text": "scroll down to input representation this" | |
| }, | |
| { | |
| "start": 3471.28, | |
| "text": "is where they talk about the tokenizer" | |
| }, | |
| { | |
| "start": 3472.68, | |
| "text": "that they're using for gpt2 now this is" | |
| }, | |
| { | |
| "start": 3475.64, | |
| "text": "all fairly readable so I encourage you" | |
| }, | |
| { | |
| "start": 3477.039, | |
| "text": "to pause and um read this yourself but" | |
| }, | |
| { | |
| "start": 3480.039, | |
| "text": "this is where they motivate the use of" | |
| }, | |
| { | |
| "start": 3482.0, | |
| "text": "the byte pair encoding algorithm on the" | |
| }, | |
| { | |
| "start": 3484.68, | |
| "text": "byte level representation of utf8" | |
| }, | |
| { | |
| "start": 3487.52, | |
| "text": "encoding so this is where they motivate" | |
| }, | |
| { | |
| "start": 3489.52, | |
| "text": "it and they talk about the vocabulary" | |
| }, | |
| { | |
| "start": 3491.079, | |
| "text": "sizes and everything now everything here" | |
| }, | |
| { | |
| "start": 3493.839, | |
| "text": "is exactly as we've covered it so far" | |
| }, | |
| { | |
| "start": 3495.92, | |
| "text": "but things start to depart around here" | |
| }, | |
| { | |
| "start": 3498.559, | |
| "text": "so what they mention is that they don't" | |
| }, | |
| { | |
| "start": 3500.44, | |
| "text": "just apply the naive algorithm as we" | |
| }, | |
| { | |
| "start": 3502.28, | |
| "text": "have done it and in particular here's a" | |
| }, | |
| { | |
| "start": 3505.16, | |
| "text": "example suppose that you have common" | |
| }, | |
| { | |
| "start": 3507.0, | |
| "text": "words like dog what will happen is that" | |
| }, | |
| { | |
| "start": 3509.48, | |
| "text": "dog of course occurs very frequently in" | |
| }, | |
| { | |
| "start": 3511.64, | |
| "text": "the text and it occurs right next to all" | |
| }, | |
| { | |
| "start": 3514.28, | |
| "text": "kinds of punctuation as an example so" | |
| }, | |
| { | |
| "start": 3516.4, | |
| "text": "dog dot dog exclamation mark dog" | |
| }, | |
| { | |
| "start": 3519.16, | |
| "text": "question mark Etc and naively you might" | |
| }, | |
| { | |
| "start": 3522.24, | |
| "text": "imagine that the BP algorithm could" | |
| }, | |
| { | |
| "start": 3523.64, | |
| "text": "merge these to be single tokens and then" | |
| }, | |
| { | |
| "start": 3525.76, | |
| "text": "you end up with lots of tokens that are" | |
| }, | |
| { | |
| "start": 3527.44, | |
| "text": "just like dog with a slightly different" | |
| }, | |
| { | |
| "start": 3529.0, | |
| "text": "punctuation and so it feels like you're" | |
| }, | |
| { | |
| "start": 3530.88, | |
| "text": "clustering things that shouldn't be" | |
| }, | |
| { | |
| "start": 3532.039, | |
| "text": "clustered you're combining kind of" | |
| }, | |
| { | |
| "start": 3533.64, | |
| "text": "semantics with" | |
| }, | |
| { | |
| "start": 3535.559, | |
| "text": "punctuation and this uh feels suboptimal and" | |
| }, | |
| { | |
| "start": 3538.92, | |
| "text": "indeed they also say that this is" | |
| }, | |
| { | |
| "start": 3540.96, | |
| "text": "suboptimal according to some of the" | |
| }, | |
| { | |
| "start": 3542.359, | |
| "text": "experiments so what they want to do is" | |
| }, | |
| { | |
| "start": 3544.2, | |
| "text": "they want to top down in a manual way" | |
| }, | |
| { | |
| "start": 3546.319, | |
| "text": "enforce that some types of um characters" | |
| }, | |
| { | |
| "start": 3549.599, | |
| "text": "should never be merged together um so" | |
| }, | |
| { | |
| "start": 3552.76, | |
| "text": "they want to enforce these merging rules" | |
| }, | |
| { | |
| "start": 3554.799, | |
| "text": "on top of the byte pair encoding algorithm" | |
| }, | |
| { | |
| "start": 3557.68, | |
| "text": "so let's take a look um at their code" | |
| }, | |
| { | |
| "start": 3559.88, | |
| "text": "and see how they actually enforce this" | |
| }, | |
| { | |
| "start": 3561.48, | |
| "text": "and what kinds of mergy they actually do" | |
| }, | |
| { | |
| "start": 3563.2, | |
| "text": "perform so I have to to tab open here" | |
| }, | |
| { | |
| "start": 3565.839, | |
| "text": "for gpt2 under open AI on GitHub and" | |
| }, | |
| { | |
| "start": 3569.64, | |
| "text": "when we go to" | |
| }, | |
| { | |
| "start": 3570.68, | |
| "text": "Source there is an encoder dot py now I" | |
| }, | |
| { | |
| "start": 3574.28, | |
| "text": "don't personally love that they call it" | |
| }, | |
| { | |
| "start": 3575.599, | |
| "text": "encoder dot py because this is the" | |
| }, | |
| { | |
| "start": 3577.079, | |
| "text": "tokenizer and the tokenizer can do both" | |
| }, | |
| { | |
| "start": 3579.359, | |
| "text": "encode and decode uh so it feels kind of" | |
| }, | |
| { | |
| "start": 3581.88, | |
| "text": "awkward to me that it's called encoder" | |
| }, | |
| { | |
| "start": 3583.2, | |
| "text": "but that is the tokenizer and there's a" | |
| }, | |
| { | |
| "start": 3585.92, | |
| "text": "lot going on here and we're going to" | |
| }, | |
| { | |
| "start": 3587.0, | |
| "text": "step through it in detail at one point" | |
| }, | |
| { | |
| "start": 3589.24, | |
| "text": "for now I just want to focus on this" | |
| }, | |
| { | |
| "start": 3591.599, | |
| "text": "part here they create a regex pattern" | |
| }, | |
| { | |
| "start": 3594.359, | |
| "text": "here that looks very complicated and" | |
| }, | |
| { | |
| "start": 3596.24, | |
| "text": "we're going to go through it in a bit uh" | |
| }, | |
| { | |
| "start": 3598.68, | |
| "text": "but this is the core part that allows" | |
| }, | |
| { | |
| "start": 3600.28, | |
| "text": "them to enforce rules uh for what parts" | |
| }, | |
| { | |
| "start": 3604.0, | |
| "text": "of the text Will Never Be merged for" | |
| }, | |
| { | |
| "start": 3605.96, | |
| "text": "sure now notice that re. compile here is" | |
| }, | |
| { | |
| "start": 3608.64, | |
| "text": "a little bit misleading because we're" | |
| }, | |
| { | |
| "start": 3610.76, | |
| "text": "not just doing import re which is the" | |
| }, | |
| { | |
| "start": 3612.44, | |
| "text": "python re module we're doing import regex" | |
| }, | |
| { | |
| "start": 3614.64, | |
| "text": "as re and regex is a python package that" | |
| }, | |
| { | |
| "start": 3617.72, | |
| "text": "you can install pip install regex and it's" | |
| }, | |
| { | |
| "start": 3620.4, | |
| "text": "basically an extension of re so it's a" | |
| }, | |
| { | |
| "start": 3622.079, | |
| "text": "bit more powerful" | |
| }, | |
| { | |
| "start": 3623.24, | |
| "text": "re um" | |
| }, | |
| { | |
| "start": 3626.0, | |
| "text": "so let's take a look at this pattern and" | |
| }, | |
| { | |
| "start": 3628.88, | |
| "text": "what it's doing and why this is actually" | |
| }, | |
| { | |
| "start": 3630.799, | |
| "text": "doing the separation that they are" | |
| }, | |
| { | |
| "start": 3632.64, | |
| "text": "looking for okay so I've copy pasted the" | |
| }, | |
| { | |
| "start": 3634.92, | |
| "text": "pattern here to our jupit notebook where" | |
| }, | |
| { | |
| "start": 3637.119, | |
| "text": "we left off and let's take this pattern" | |
| }, | |
| { | |
| "start": 3639.24, | |
| "text": "for a spin so in the exact same way that" | |
| }, | |
| { | |
| "start": 3642.119, | |
| "text": "their code does we're going to call an" | |
| }, | |
| { | |
| "start": 3644.079, | |
| "text": "re. findall for this pattern on any" | |
| }, | |
| { | |
| "start": 3647.28, | |
| "text": "arbitrary string that we are interested" | |
| }, | |
| { | |
| "start": 3649.359, | |
| "text": "so this is the string that we want to" | |
| }, | |
| { | |
| "start": 3650.599, | |
| "text": "encode into tokens um to feed into an llm" | |
| }, | |
| { | |
| "start": 3655.24, | |
| "text": "like gpt2 so what exactly is this doing" | |
| }, | |
| { | |
| "start": 3659.039, | |
| "text": "well re. findall will take this pattern" | |
| }, | |
| { | |
| "start": 3661.039, | |
| "text": "and try to match it against a" | |
| }, | |
| { | |
| "start": 3662.839, | |
| "text": "string um the way this works is that you" | |
| }, | |
| { | |
| "start": 3666.119, | |
| "text": "are going from left to right in the" | |
| }, | |
| { | |
| "start": 3667.96, | |
| "text": "string and you're trying to match the" | |
| }, | |
| { | |
| "start": 3670.28, | |
| "text": "pattern and R.F find all will get all" | |
| }, | |
| { | |
| "start": 3673.799, | |
| "text": "the occurrences and organize them into a" | |
| }, | |
| { | |
| "start": 3676.319, | |
| "text": "list now when you look at the um when" | |
| }, | |
| { | |
| "start": 3679.16, | |
| "text": "you look at this pattern first of all" | |
| }, | |
| { | |
| "start": 3680.88, | |
| "text": "notice that this is a raw string um and" | |
| }, | |
| { | |
| "start": 3683.96, | |
| "text": "then these are three double quotes just" | |
| }, | |
| { | |
| "start": 3686.319, | |
| "text": "to start the string so really the string" | |
| }, | |
| { | |
| "start": 3688.839, | |
| "text": "itself this is the pattern itself" | |
| }, | |
| { | |
| "start": 3691.319, | |
| "text": "right and notice that it's made up of a" | |
| }, | |
| { | |
| "start": 3694.079, | |
| "text": "lot of ores so see these vertical bars" | |
| }, | |
| { | |
| "start": 3696.48, | |
| "text": "those are ores in reg X and so you go" | |
| }, | |
| { | |
| "start": 3700.2, | |
| "text": "from left to right in this pattern and" | |
| }, | |
| { | |
| "start": 3701.48, | |
| "text": "try to match it against the string" | |
| }, | |
| { | |
| "start": 3703.16, | |
| "text": "wherever you are so we have hello and" | |
| }, | |
| { | |
| "start": 3706.44, | |
| "text": "we're going to try to match it well it's" | |
| }, | |
| { | |
| "start": 3708.24, | |
| "text": "not apostrophe s it's not apostrophe t" | |
| }, | |
| { | |
| "start": 3710.799, | |
| "text": "or any of these but it is an optional" | |
| }, | |
| { | |
| "start": 3713.96, | |
| "text": "space followed by- P of uh sorry SL P of" | |
| }, | |
| { | |
| "start": 3718.119, | |
| "text": "L one or more times what is/ P of L it" | |
| }, | |
| { | |
| "start": 3722.319, | |
| "text": "is coming to some documentation that I" | |
| }, | |
| { | |
| "start": 3724.72, | |
| "text": "found um there might be other sources as" | |
| }, | |
| { | |
| "start": 3728.0, | |
| "text": "well uh SLP is a letter any kind of" | |
| }, | |
| { | |
| "start": 3731.599, | |
| "text": "letter from any language and hello is" | |
| }, | |
| { | |
| "start": 3735.039, | |
| "text": "made up of letters h e l Etc so optional" | |
| }, | |
| { | |
| "start": 3739.52, | |
| "text": "space followed by a bunch of letters one" | |
| }, | |
| { | |
| "start": 3741.559, | |
| "text": "or more letters is going to match hello" | |
| }, | |
| { | |
| "start": 3744.72, | |
| "text": "but then the match ends because a white" | |
| }, | |
| { | |
| "start": 3747.079, | |
| "text": "space is not a letter so from there on" | |
| }, | |
| { | |
| "start": 3751.079, | |
| "text": "begins a new sort of attempt to match" | |
| }, | |
| { | |
| "start": 3753.64, | |
| "text": "against the string again and starting in" | |
| }, | |
| { | |
| "start": 3756.44, | |
| "text": "here we're going to skip over all of" | |
| }, | |
| { | |
| "start": 3758.079, | |
| "text": "these again until we get to the exact" | |
| }, | |
| { | |
| "start": 3760.16, | |
| "text": "same Point again and we see that there's" | |
| }, | |
| { | |
| "start": 3762.319, | |
| "text": "an optional space this is the optional" | |
| }, | |
| { | |
| "start": 3764.279, | |
| "text": "space followed by a bunch of letters one" | |
| }, | |
| { | |
| "start": 3766.24, | |
| "text": "or more of them and so that matches so" | |
| }, | |
| { | |
| "start": 3768.72, | |
| "text": "when we run this we get a list of two" | |
| }, | |
| { | |
| "start": 3772.0, | |
| "text": "elements hello and then space world" | |
| }, | |
| { | |
| "start": 3775.72, | |
| "text": "so how are you if we add more letters we" | |
| }, | |
| { | |
| "start": 3778.88, | |
| "text": "would just get them like this now what" | |
| }, | |
| { | |
| "start": 3781.599, | |
| "text": "is this doing and why is this important" | |
| }, | |
| { | |
| "start": 3783.64, | |
| "text": "we are taking our string and instead of" | |
| }, | |
| { | |
| "start": 3785.92, | |
| "text": "directly encoding it um for" | |
| }, | |
| { | |
| "start": 3789.0, | |
| "text": "tokenization we are first splitting it" | |
| }, | |
| { | |
| "start": 3791.4, | |
| "text": "up and when you actually step through" | |
| }, | |
| { | |
| "start": 3793.48, | |
| "text": "the code and we'll do that in a bit more" | |
| }, | |
| { | |
| "start": 3795.319, | |
| "text": "detail what really is doing on a high" | |
| }, | |
| { | |
| "start": 3797.359, | |
| "text": "level is that it first splits your text" | |
| }, | |
| { | |
| "start": 3800.92, | |
| "text": "into a list of texts just like this one" | |
| }, | |
| { | |
| "start": 3804.64, | |
| "text": "and all these elements of this list are" | |
| }, | |
| { | |
| "start": 3806.559, | |
| "text": "processed independently by the tokenizer" | |
| }, | |
| { | |
| "start": 3809.279, | |
| "text": "and all of the results of that" | |
| }, | |
| { | |
| "start": 3810.76, | |
| "text": "processing are simply" | |
| }, | |
| { | |
| "start": 3812.279, | |
| "text": "concatenated so hello world oh I I" | |
| }, | |
| { | |
| "start": 3815.92, | |
| "text": "missed how hello world how are you we" | |
| }, | |
| { | |
| "start": 3819.64, | |
| "text": "have five elements of list all of these" | |
| }, | |
| { | |
| "start": 3821.599, | |
| "text": "will independent" | |
| }, | |
| { | |
| "start": 3824.4, | |
| "text": "independently go from text to a token" | |
| }, | |
| { | |
| "start": 3827.0, | |
| "text": "sequence and then that token sequence is" | |
| }, | |
| { | |
| "start": 3829.2, | |
| "text": "going to be concatenated it's all going" | |
| }, | |
| { | |
| "start": 3830.799, | |
| "text": "to be joined up and roughly speaking" | |
| }, | |
| { | |
| "start": 3834.359, | |
| "text": "what that does is you're only ever" | |
| }, | |
| { | |
| "start": 3836.119, | |
| "text": "finding merges between the elements of" | |
| }, | |
| { | |
| "start": 3838.44, | |
| "text": "this list so you can only ever consider" | |
| }, | |
| { | |
| "start": 3840.359, | |
| "text": "merges within every one of these" | |
| }, | |
| { | |
| "start": 3841.72, | |
| "text": "elements in" | |
| }, | |
| { | |
| "start": 3843.24, | |
| "text": "individually and um after you've done" | |
| }, | |
| { | |
| "start": 3846.319, | |
| "text": "all the possible merging for all of" | |
| }, | |
| { | |
| "start": 3847.92, | |
| "text": "these elements individually the results" | |
| }, | |
| { | |
| "start": 3849.88, | |
| "text": "of all that will be joined um by" | |
| }, | |
| { | |
| "start": 3853.64, | |
| "text": "concatenation and so you are basically" | |
| }, | |
| { | |
| "start": 3856.24, | |
| "text": "what what you're doing effectively is" | |
| }, | |
| { | |
| "start": 3858.4, | |
| "text": "you are never going to be merging this e" | |
| }, | |
| { | |
| "start": 3861.0, | |
| "text": "with this space because they are now" | |
| }, | |
| { | |
| "start": 3863.2, | |
| "text": "parts of the separate elements of this" | |
| }, | |
| { | |
| "start": 3865.079, | |
| "text": "list and so you are saying we are never" | |
| }, | |
| { | |
| "start": 3867.72, | |
| "text": "going to merge" | |
| }, | |
| { | |
| "start": 3868.92, | |
| "text": "eace um because we're breaking it up in" | |
| }, | |
| { | |
| "start": 3872.039, | |
| "text": "this way so basically using this regx" | |
| }, | |
| { | |
| "start": 3875.72, | |
| "text": "pattern to Chunk Up the text is just one" | |
| }, | |
| { | |
| "start": 3877.96, | |
| "text": "way of enforcing that some merges are" | |
| }, | |
| { | |
| "start": 3881.72, | |
| "text": "not to happen and we're going to go into" | |
| }, | |
| { | |
| "start": 3883.76, | |
| "text": "more of this text and we'll see that" | |
| }, | |
| { | |
| "start": 3885.2, | |
| "text": "what this is trying to do on a high" | |
| }, | |
| { | |
| "start": 3886.24, | |
| "text": "level is we're trying to not merge" | |
| }, | |
| { | |
| "start": 3888.0, | |
| "text": "across letters across numbers across" | |
| }, | |
| { | |
| "start": 3890.64, | |
| "text": "punctuation and so on so let's see in" | |
| }, | |
| { | |
| "start": 3893.2, | |
| "text": "more detail how that works so let's" | |
| }, | |
| { | |
| "start": 3894.72, | |
| "text": "continue now we have/ P ofn if you go to" | |
| }, | |
| { | |
| "start": 3898.0, | |
| "text": "the documentation SLP of n is any kind" | |
| }, | |
| { | |
| "start": 3901.839, | |
| "text": "of numeric character in any script so" | |
| }, | |
| { | |
| "start": 3904.44, | |
| "text": "it's numbers so we have an optional" | |
| }, | |
| { | |
| "start": 3906.599, | |
| "text": "space followed by numbers and those" | |
| }, | |
| { | |
| "start": 3908.119, | |
| "text": "would be separated out so letters and" | |
| }, | |
| { | |
| "start": 3910.359, | |
| "text": "numbers are being separated so if I do" | |
| }, | |
| { | |
| "start": 3912.559, | |
| "text": "Hello World 123 how are you then world" | |
| }, | |
| { | |
| "start": 3915.839, | |
| "text": "will stop matching here because one is" | |
| }, | |
| { | |
| "start": 3917.96, | |
| "text": "not a letter anymore but one is a number" | |
| }, | |
| { | |
| "start": 3920.64, | |
| "text": "so this group will match for that and" | |
| }, | |
| { | |
| "start": 3922.52, | |
| "text": "we'll get it as a separate entity" | |
| }, | |
| { | |
| "start": 3926.559, | |
| "text": "uh let's see how these apostrophes work" | |
| }, | |
| { | |
| "start": 3928.359, | |
| "text": "so here if we have" | |
| }, | |
| { | |
| "start": 3931.0, | |
| "text": "um uh Slash V or I mean apostrophe V as" | |
| }, | |
| { | |
| "start": 3935.079, | |
| "text": "an example then apostrophe here is not a" | |
| }, | |
| { | |
| "start": 3938.359, | |
| "text": "letter or a" | |
| }, | |
| { | |
| "start": 3939.52, | |
| "text": "number so hello will stop matching and" | |
| }, | |
| { | |
| "start": 3942.44, | |
| "text": "then we will exactly match this with" | |
| }, | |
| { | |
| "start": 3944.96, | |
| "text": "that so that will come out as a separate" | |
| }, | |
| { | |
| "start": 3948.2, | |
| "text": "thing so why are they doing the" | |
| }, | |
| { | |
| "start": 3950.24, | |
| "text": "apostrophes here honestly I think that" | |
| }, | |
| { | |
| "start": 3952.24, | |
| "text": "these are just like very common" | |
| }, | |
| { | |
| "start": 3953.599, | |
| "text": "apostrophes p uh that are used um" | |
| }, | |
| { | |
| "start": 3956.96, | |
| "text": "typically I don't love that they've done" | |
| }, | |
| { | |
| "start": 3959.359, | |
| "text": "this" | |
| }, | |
| { | |
| "start": 3960.599, | |
| "text": "because uh let me show you what happens" | |
| }, | |
| { | |
| "start": 3963.319, | |
| "text": "when you have uh some Unicode" | |
| }, | |
| { | |
| "start": 3965.44, | |
| "text": "apostrophes like for example you can" | |
| }, | |
| { | |
| "start": 3967.359, | |
| "text": "have if you have house then this will be" | |
| }, | |
| { | |
| "start": 3970.559, | |
| "text": "separated out because of this matching" | |
| }, | |
| { | |
| "start": 3973.039, | |
| "text": "but if you use the Unicode apostrophe" | |
| }, | |
| { | |
| "start": 3975.319, | |
| "text": "like" | |
| }, | |
| { | |
| "start": 3976.16, | |
| "text": "this then suddenly this does not work" | |
| }, | |
| { | |
| "start": 3979.839, | |
| "text": "and so this apostrophe will actually" | |
| }, | |
| { | |
| "start": 3981.559, | |
| "text": "become its own thing now and so so um" | |
| }, | |
| { | |
| "start": 3984.92, | |
| "text": "it's basically hardcoded for this" | |
| }, | |
| { | |
| "start": 3986.359, | |
| "text": "specific kind of apostrophe and uh" | |
| }, | |
| { | |
| "start": 3989.68, | |
| "text": "otherwise they become completely" | |
| }, | |
| { | |
| "start": 3991.319, | |
| "text": "separate tokens in addition to this you" | |
| }, | |
| { | |
| "start": 3994.039, | |
| "text": "can go to the gpt2 docs and here when" | |
| }, | |
| { | |
| "start": 3998.48, | |
| "text": "they Define the pattern they say should" | |
| }, | |
| { | |
| "start": 4000.2, | |
| "text": "have added re. ignore case so BP merges" | |
| }, | |
| { | |
| "start": 4003.0, | |
| "text": "can happen for capitalized versions of" | |
| }, | |
| { | |
| "start": 4004.559, | |
| "text": "contractions so what they're pointing" | |
| }, | |
| { | |
| "start": 4006.52, | |
| "text": "out is that you see how this is" | |
| }, | |
| { | |
| "start": 4007.72, | |
| "text": "apostrophe and then lowercase letters" | |
| }, | |
| { | |
| "start": 4010.839, | |
| "text": "well because they didn't do re. ignore" | |
| }, | |
| { | |
| "start": 4012.92, | |
| "text": "case then then um these rules will not" | |
| }, | |
| { | |
| "start": 4016.44, | |
| "text": "separate out the apostrophes if it's" | |
| }, | |
| { | |
| "start": 4018.88, | |
| "text": "uppercase so" | |
| }, | |
| { | |
| "start": 4021.44, | |
| "text": "house would be like this but if I did" | |
| }, | |
| { | |
| "start": 4026.64, | |
| "text": "house if I'm uppercase then notice" | |
| }, | |
| { | |
| "start": 4030.24, | |
| "text": "suddenly the apostrophe comes by" | |
| }, | |
| { | |
| "start": 4032.279, | |
| "text": "itself so the tokenization will work" | |
| }, | |
| { | |
| "start": 4035.48, | |
| "text": "differently in uppercase and lower case" | |
| }, | |
| { | |
| "start": 4037.44, | |
| "text": "inconsistently separating out these" | |
| }, | |
| { | |
| "start": 4039.039, | |
| "text": "apostrophes so it feels extremely gnarly" | |
| }, | |
| { | |
| "start": 4041.119, | |
| "text": "and slightly gross um but that's that's" | |
| }, | |
| { | |
| "start": 4044.52, | |
| "text": "how that works okay so let's come back" | |
| }, | |
| { | |
| "start": 4047.24, | |
| "text": "after trying to match a bunch of" | |
| }, | |
| { | |
| "start": 4048.44, | |
| "text": "apostrophe Expressions by the way the" | |
| }, | |
| { | |
| "start": 4050.279, | |
| "text": "other issue here is that these are quite" | |
| }, | |
| { | |
| "start": 4052.079, | |
| "text": "language specific probably so I don't" | |
| }, | |
| { | |
| "start": 4054.559, | |
| "text": "know that all the languages for example" | |
| }, | |
| { | |
| "start": 4055.799, | |
| "text": "use or don't use apostrophes but that" | |
| }, | |
| { | |
| "start": 4057.48, | |
| "text": "would be inconsistently tokenized as a" | |
| }, | |
| { | |
| "start": 4059.96, | |
| "text": "result then we try to match letters then" | |
| }, | |
| { | |
| "start": 4062.52, | |
| "text": "we try to match numbers and then if that" | |
| }, | |
| { | |
| "start": 4064.88, | |
| "text": "doesn't work we fall back to here and" | |
| }, | |
| { | |
| "start": 4067.559, | |
| "text": "what this is saying is again optional" | |
| }, | |
| { | |
| "start": 4069.16, | |
| "text": "space followed by something that is not" | |
| }, | |
| { | |
| "start": 4070.839, | |
| "text": "a letter number or a space in one or" | |
| }, | |
| { | |
| "start": 4073.96, | |
| "text": "more of that so what this is doing" | |
| }, | |
| { | |
| "start": 4075.799, | |
| "text": "effectively is this is trying to match" | |
| }, | |
| { | |
| "start": 4077.559, | |
| "text": "punctuation roughly speaking not letters" | |
| }, | |
| { | |
| "start": 4079.52, | |
| "text": "and not numbers so this group will try" | |
| }, | |
| { | |
| "start": 4082.279, | |
| "text": "to trigger for that so if I do something" | |
| }, | |
| { | |
| "start": 4084.2, | |
| "text": "like this then these parts here are not" | |
| }, | |
| { | |
| "start": 4088.48, | |
| "text": "letters or numbers but they will" | |
| }, | |
| { | |
| "start": 4089.96, | |
| "text": "actually they are uh they will actually" | |
| }, | |
| { | |
| "start": 4092.039, | |
| "text": "get caught here and so they become its" | |
| }, | |
| { | |
| "start": 4094.48, | |
| "text": "own group so we've separated out the" | |
| }, | |
| { | |
| "start": 4097.4, | |
| "text": "punctuation and finally this um this is" | |
| }, | |
| { | |
| "start": 4100.08, | |
| "text": "also a little bit confusing so this is" | |
| }, | |
| { | |
| "start": 4102.159, | |
| "text": "matching white space but this is using a" | |
| }, | |
| { | |
| "start": 4105.359, | |
| "text": "negative look ahead assertion in regex" | |
| }, | |
| { | |
| "start": 4109.04, | |
| "text": "so what this is doing is it's matching" | |
| }, | |
| { | |
| "start": 4110.92, | |
| "text": "white space up to but not including the" | |
| }, | |
| { | |
| "start": 4113.279, | |
| "text": "last white space" | |
| }, | |
| { | |
| "start": 4115.0, | |
| "text": "character why is this important um this" | |
| }, | |
| { | |
| "start": 4117.92, | |
| "text": "is pretty subtle I think so you see how" | |
| }, | |
| { | |
| "start": 4120.279, | |
| "text": "the white space is always included at" | |
| }, | |
| { | |
| "start": 4121.719, | |
| "text": "the beginning of the word so um space r" | |
| }, | |
| { | |
| "start": 4125.52, | |
| "text": "space u Etc suppose we have a lot of" | |
| }, | |
| { | |
| "start": 4128.08, | |
| "text": "spaces" | |
| }, | |
| { | |
| "start": 4129.4, | |
| "text": "here what's going to happen here is that" | |
| }, | |
| { | |
| "start": 4132.359, | |
| "text": "these spaces up to not including the" | |
| }, | |
| { | |
| "start": 4134.6, | |
| "text": "last character will get caught by this" | |
| }, | |
| { | |
| "start": 4137.92, | |
| "text": "and what that will do is it will" | |
| }, | |
| { | |
| "start": 4139.719, | |
| "text": "separate out the spaces up to but not" | |
| }, | |
| { | |
| "start": 4141.88, | |
| "text": "including the last character so that the" | |
| }, | |
| { | |
| "start": 4143.679, | |
| "text": "last character can come here and join" | |
| }, | |
| { | |
| "start": 4145.92, | |
| "text": "with the um space you and the reason" | |
| }, | |
| { | |
| "start": 4149.239, | |
| "text": "that's nice is because space you is the" | |
| }, | |
| { | |
| "start": 4151.44, | |
| "text": "common token so if I didn't have these" | |
| }, | |
| { | |
| "start": 4153.799, | |
| "text": "Extra Spaces here you would just have" | |
| }, | |
| { | |
| "start": 4155.44, | |
| "text": "space you and if I add tokens if I add" | |
| }, | |
| { | |
| "start": 4158.159, | |
| "text": "spaces we still have a space view but" | |
| }, | |
| { | |
| "start": 4160.719, | |
| "text": "now we have all this extra white space" | |
| }, | |
| { | |
| "start": 4162.96, | |
| "text": "so basically the gpt2 tokenizer really" | |
| }, | |
| { | |
| "start": 4164.719, | |
| "text": "likes to have a space letters or numbers" | |
| }, | |
| { | |
| "start": 4167.44, | |
| "text": "um and it it prepends these spaces and" | |
| }, | |
| { | |
| "start": 4170.44, | |
| "text": "this is just something that it is" | |
| }, | |
| { | |
| "start": 4171.4, | |
| "text": "consistent about so that's what that is" | |
| }, | |
| { | |
| "start": 4173.679, | |
| "text": "for and then finally we have all the the" | |
| }, | |
| { | |
| "start": 4176.4, | |
| "text": "last fallback is um whites space" | |
| }, | |
| { | |
| "start": 4178.64, | |
| "text": "characters uh so um that would be" | |
| }, | |
| { | |
| "start": 4182.719, | |
| "text": "just um if that doesn't get caught then" | |
| }, | |
| { | |
| "start": 4186.679, | |
| "text": "this thing will catch any trailing" | |
| }, | |
| { | |
| "start": 4188.52, | |
| "text": "spaces and so on I wanted to show one" | |
| }, | |
| { | |
| "start": 4190.759, | |
| "text": "more real world example here so if we" | |
| }, | |
| { | |
| "start": 4193.159, | |
| "text": "have this string which is a piece of" | |
| }, | |
| { | |
| "start": 4194.44, | |
| "text": "python code and then we try to split it" | |
| }, | |
| { | |
| "start": 4196.36, | |
| "text": "up then this is the kind of output we" | |
| }, | |
| { | |
| "start": 4198.4, | |
| "text": "get so you'll notice that the list has" | |
| }, | |
| { | |
| "start": 4200.56, | |
| "text": "many elements here and that's because we" | |
| }, | |
| { | |
| "start": 4202.48, | |
| "text": "are splitting up fairly often uh every" | |
| }, | |
| { | |
| "start": 4205.12, | |
| "text": "time sort of a category" | |
| }, | |
| { | |
| "start": 4207.12, | |
| "text": "changes um so there will never be any" | |
| }, | |
| { | |
| "start": 4209.36, | |
| "text": "merges Within These" | |
| }, | |
| { | |
| "start": 4210.96, | |
| "text": "elements and um that's what you are" | |
| }, | |
| { | |
| "start": 4213.48, | |
| "text": "seeing here now you might think that in" | |
| }, | |
| { | |
| "start": 4216.44, | |
| "text": "order to train the" | |
| }, | |
| { | |
| "start": 4217.76, | |
| "text": "tokenizer uh open AI has used this to" | |
| }, | |
| { | |
| "start": 4221.12, | |
| "text": "split up text into chunks and then run" | |
| }, | |
| { | |
| "start": 4223.88, | |
| "text": "just a BPE algorithm within all the" | |
| }, | |
| { | |
| "start": 4225.8, | |
| "text": "chunks but that is not exactly what" | |
| }, | |
| { | |
| "start": 4227.96, | |
| "text": "happened and the reason is the following" | |
| }, | |
| { | |
| "start": 4230.28, | |
| "text": "notice that we have the spaces here uh" | |
| }, | |
| { | |
| "start": 4233.32, | |
| "text": "those Spaces end up being entire" | |
| }, | |
| { | |
| "start": 4235.44, | |
| "text": "elements but these spaces never actually" | |
| }, | |
| { | |
| "start": 4238.36, | |
| "text": "end up being merged by by open Ai and" | |
| }, | |
| { | |
| "start": 4240.64, | |
| "text": "the way you can tell is that if you copy" | |
| }, | |
| { | |
| "start": 4242.48, | |
| "text": "paste the exact same chunk here into Tik" | |
| }, | |
| { | |
| "start": 4244.199, | |
| "text": "token U Tik tokenizer you see that all" | |
| }, | |
| { | |
| "start": 4247.28, | |
| "text": "the spaces are kept independent and" | |
| }, | |
| { | |
| "start": 4249.28, | |
| "text": "they're all token" | |
| }, | |
| { | |
| "start": 4251.0, | |
| "text": "220 so I think openai at some point" | |
| }, | |
| { | |
| "start": 4253.84, | |
| "text": "enforced some rule that these spaces" | |
| }, | |
| { | |
| "start": 4256.04, | |
| "text": "would never be merged and so um there's" | |
| }, | |
| { | |
| "start": 4259.4, | |
| "text": "some additional rules on top of just" | |
| }, | |
| { | |
| "start": 4261.28, | |
| "text": "chunking and bpe that openai is not uh" | |
| }, | |
| { | |
| "start": 4264.199, | |
| "text": "clear about now the training code for" | |
| }, | |
| { | |
| "start": 4266.32, | |
| "text": "the gpt2 tokenizer was never released so" | |
| }, | |
| { | |
| "start": 4268.679, | |
| "text": "all we have is uh the code that I've" | |
| }, | |
| { | |
| "start": 4270.8, | |
| "text": "already shown you but this code here" | |
| }, | |
| { | |
| "start": 4273.28, | |
| "text": "that they've released is only the" | |
| }, | |
| { | |
| "start": 4274.4, | |
| "text": "inference code for the tokens so this is" | |
| }, | |
| { | |
| "start": 4277.679, | |
| "text": "not the training code you can't give it" | |
| }, | |
| { | |
| "start": 4279.08, | |
| "text": "a piece of text and training tokenizer" | |
| }, | |
| { | |
| "start": 4281.52, | |
| "text": "this is just the inference code which" | |
| }, | |
| { | |
| "start": 4283.32, | |
| "text": "takes the merges that we have up" | |
| }, | |
| { | |
| "start": 4285.6, | |
| "text": "above and applies them to a new piece of" | |
| }, | |
| { | |
| "start": 4288.32, | |
| "text": "text and so we don't know exactly how" | |
| }, | |
| { | |
| "start": 4290.56, | |
| "text": "openai trained um trained the" | |
| }, | |
| { | |
| "start": 4292.48, | |
| "text": "tokenizer but it wasn't as simple as" | |
| }, | |
| { | |
| "start": 4294.64, | |
| "text": "chunk it up and BP it uh whatever it was" | |
| }, | |
| { | |
| "start": 4298.36, | |
| "text": "next I wanted to introduce you to the" | |
| }, | |
| { | |
| "start": 4300.239, | |
| "text": "Tik token library from openai which is" | |
| }, | |
| { | |
| "start": 4302.48, | |
| "text": "the official library for tokenization" | |
| }, | |
| { | |
| "start": 4304.8, | |
| "text": "from openai so this is tiktoken pip" | |
| }, | |
| { | |
| "start": 4308.36, | |
| "text": "install tiktoken and then um you" | |
| }, | |
| { | |
| "start": 4311.44, | |
| "text": "can do the tokenization in inference" | |
| }, | |
| { | |
| "start": 4314.36, | |
| "text": "this is again not training code this is" | |
| }, | |
| { | |
| "start": 4315.88, | |
| "text": "only inference code for" | |
| }, | |
| { | |
| "start": 4317.92, | |
| "text": "tokenization um I wanted to show you how" | |
| }, | |
| { | |
| "start": 4320.36, | |
| "text": "you would use it quite simple and" | |
| }, | |
| { | |
| "start": 4322.48, | |
| "text": "running this just gives us the gpt2" | |
| }, | |
| { | |
| "start": 4324.36, | |
| "text": "tokens or the GPT 4 tokens so this is" | |
| }, | |
| { | |
| "start": 4326.92, | |
| "text": "the tokenizer use for GPT 4 and so in" | |
| }, | |
| { | |
| "start": 4329.679, | |
| "text": "particular we see that the white space in" | |
| }, | |
| { | |
| "start": 4331.239, | |
| "text": "gpt2 remains unmerged but in GPT 4 uh" | |
| }, | |
| { | |
| "start": 4334.48, | |
| "text": "these white spaces merge as we also saw" | |
| }, | |
| { | |
| "start": 4337.32, | |
| "text": "in this one where here they're all" | |
| }, | |
| { | |
| "start": 4339.44, | |
| "text": "unmerged but if we go down to GPT 4 uh" | |
| }, | |
| { | |
| "start": 4342.639, | |
| "text": "they become merged" | |
| }, | |
| { | |
| "start": 4345.239, | |
| "text": "um now in the" | |
| }, | |
| { | |
| "start": 4347.76, | |
| "text": "gp4 uh tokenizer they changed the" | |
| }, | |
| { | |
| "start": 4351.04, | |
| "text": "regular expression that they use to" | |
| }, | |
| { | |
| "start": 4353.12, | |
| "text": "Chunk Up text so the way to see this is" | |
| }, | |
| { | |
| "start": 4355.639, | |
| "text": "that if you come to your the Tik token" | |
| }, | |
| { | |
| "start": 4358.0, | |
| "text": "uh library and then you go to this file" | |
| }, | |
| { | |
| "start": 4361.08, | |
| "text": "tiktoken ext openai public this is where" | |
| }, | |
| { | |
| "start": 4364.12, | |
| "text": "sort of like the definition of all these" | |
| }, | |
| { | |
| "start": 4365.639, | |
| "text": "different tokenizers that openai" | |
| }, | |
| { | |
| "start": 4366.96, | |
| "text": "maintains is and so uh necessarily to do" | |
| }, | |
| { | |
| "start": 4370.56, | |
| "text": "the inference they had to publish some" | |
| }, | |
| { | |
| "start": 4371.76, | |
| "text": "of the details about the strings" | |
| }, | |
| { | |
| "start": 4373.96, | |
| "text": "so this is the string that we already" | |
| }, | |
| { | |
| "start": 4375.36, | |
| "text": "saw for gpt2 it is slightly different" | |
| }, | |
| { | |
| "start": 4378.36, | |
| "text": "but it is actually equivalent uh to what" | |
| }, | |
| { | |
| "start": 4380.36, | |
| "text": "we discussed here so this pattern that" | |
| }, | |
| { | |
| "start": 4382.84, | |
| "text": "we discussed is equivalent to this" | |
| }, | |
| { | |
| "start": 4384.96, | |
| "text": "pattern this one just executes a little" | |
| }, | |
| { | |
| "start": 4387.0, | |
| "text": "bit faster so here you see a little bit" | |
| }, | |
| { | |
| "start": 4389.239, | |
| "text": "of a slightly different definition but" | |
| }, | |
| { | |
| "start": 4390.719, | |
| "text": "otherwise it's the same we're going to" | |
| }, | |
| { | |
| "start": 4392.719, | |
| "text": "go into special tokens in a bit and then" | |
| }, | |
| { | |
| "start": 4395.32, | |
| "text": "if you scroll down to CL 100k this is" | |
| }, | |
| { | |
| "start": 4398.6, | |
| "text": "the GPT 4 tokenizer you see that the" | |
| }, | |
| { | |
| "start": 4400.76, | |
| "text": "pattern has changed um and this is kind" | |
| }, | |
| { | |
| "start": 4403.96, | |
| "text": "of like the main the major change in" | |
| }, | |
| { | |
| "start": 4406.08, | |
| "text": "addition to a bunch of other special" | |
| }, | |
| { | |
| "start": 4407.36, | |
| "text": "tokens which I'll go into in a bit again" | |
| }, | |
| { | |
| "start": 4410.4, | |
| "text": "now some I'm not going to actually go" | |
| }, | |
| { | |
| "start": 4411.84, | |
| "text": "into the full detail of the pattern" | |
| }, | |
| { | |
| "start": 4413.28, | |
| "text": "change because honestly this is my" | |
| }, | |
| { | |
| "start": 4415.44, | |
| "text": "numbing uh I would just advise that you" | |
| }, | |
| { | |
| "start": 4417.44, | |
| "text": "pull out chat GPT and the regex" | |
| }, | |
| { | |
| "start": 4419.88, | |
| "text": "documentation and just step through it" | |
| }, | |
| { | |
| "start": 4422.159, | |
| "text": "but really the major changes are number" | |
| }, | |
| { | |
| "start": 4424.52, | |
| "text": "one you see this eye here that means" | |
| }, | |
| { | |
| "start": 4428.08, | |
| "text": "that the um case sensitivity this is" | |
| }, | |
| { | |
| "start": 4431.08, | |
| "text": "case insensitive match and so the" | |
| }, | |
| { | |
| "start": 4433.679, | |
| "text": "comment that we saw earlier on oh we" | |
| }, | |
| { | |
| "start": 4436.12, | |
| "text": "should have used re. uppercase uh" | |
| }, | |
| { | |
| "start": 4438.4, | |
| "text": "basically we're now going to be matching" | |
| }, | |
| { | |
| "start": 4441.8, | |
| "text": "these apostrophe s apostrophe D" | |
| }, | |
| { | |
| "start": 4444.6, | |
| "text": "apostrophe M Etc uh we're going to be" | |
| }, | |
| { | |
| "start": 4446.92, | |
| "text": "matching them both in lowercase and in" | |
| }, | |
| { | |
| "start": 4448.6, | |
| "text": "uppercase so that's fixed there's a" | |
| }, | |
| { | |
| "start": 4451.32, | |
| "text": "bunch of different like handling of the" | |
| }, | |
| { | |
| "start": 4452.76, | |
| "text": "whites space that I'm not going to go" | |
| }, | |
| { | |
| "start": 4454.08, | |
| "text": "into the full details of and then one" | |
| }, | |
| { | |
| "start": 4456.48, | |
| "text": "more thing here is you will notice that" | |
| }, | |
| { | |
| "start": 4458.639, | |
| "text": "when they match the numbers they only" | |
| }, | |
| { | |
| "start": 4460.679, | |
| "text": "match one to three numbers so so they" | |
| }, | |
| { | |
| "start": 4463.56, | |
| "text": "will never merge" | |
| }, | |
| { | |
| "start": 4466.12, | |
| "text": "numbers that are in low in more than" | |
| }, | |
| { | |
| "start": 4468.88, | |
| "text": "three digits only up to three digits of" | |
| }, | |
| { | |
| "start": 4471.159, | |
| "text": "numbers will ever be merged and uh" | |
| }, | |
| { | |
| "start": 4474.679, | |
| "text": "that's one change that they made as well" | |
| }, | |
| { | |
| "start": 4476.32, | |
| "text": "to prevent uh tokens that are very very" | |
| }, | |
| { | |
| "start": 4478.6, | |
| "text": "long number" | |
| }, | |
| { | |
| "start": 4480.0, | |
| "text": "sequences uh but again we don't really" | |
| }, | |
| { | |
| "start": 4482.08, | |
| "text": "know why they do any of this stuff uh" | |
| }, | |
| { | |
| "start": 4484.199, | |
| "text": "because none of this is documented and" | |
| }, | |
| { | |
| "start": 4486.28, | |
| "text": "uh it's just we just get the pattern so" | |
| }, | |
| { | |
| "start": 4489.52, | |
| "text": "um yeah it is what it is but those are" | |
| }, | |
| { | |
| "start": 4491.76, | |
| "text": "some of the changes that gp4 has made" | |
| }, | |
| { | |
| "start": 4494.36, | |
| "text": "and of course the vocabulary size went" | |
| }, | |
| { | |
| "start": 4496.36, | |
| "text": "from roughly 50k to roughly" | |
| }, | |
| { | |
| "start": 4498.4, | |
| "text": "100K the next thing I would like to do" | |
| }, | |
| { | |
| "start": 4500.4, | |
| "text": "very briefly is to take you through the" | |
| }, | |
| { | |
| "start": 4502.32, | |
| "text": "gpt2 encoder.py that openai has" | |
| }, | |
| { | |
| "start": 4505.4, | |
| "text": "released uh this is the file that I" | |
| }, | |
| { | |
| "start": 4507.36, | |
| "text": "already mentioned to you briefly now" | |
| }, | |
| { | |
| "start": 4509.639, | |
| "text": "this file is uh fairly short and should" | |
| }, | |
| { | |
| "start": 4512.84, | |
| "text": "be relatively understandable to you at" | |
| }, | |
| { | |
| "start": 4514.639, | |
| "text": "this point um starting at the bottom" | |
| }, | |
| { | |
| "start": 4517.96, | |
| "text": "here they are loading two files encoder" | |
| }, | |
| { | |
| "start": 4521.48, | |
| "text": "Json and vocab bpe and they do some" | |
| }, | |
| { | |
| "start": 4524.159, | |
| "text": "light processing on it and then they" | |
| }, | |
| { | |
| "start": 4525.4, | |
| "text": "call this encoder object which is the" | |
| }, | |
| { | |
| "start": 4527.719, | |
| "text": "tokenizer now if you'd like to inspect" | |
| }, | |
| { | |
| "start": 4530.12, | |
| "text": "these two files which together" | |
| }, | |
| { | |
| "start": 4531.96, | |
| "text": "constitute their saved tokenizer then" | |
| }, | |
| { | |
| "start": 4534.56, | |
| "text": "you can do that with a piece of code" | |
| }, | |
| { | |
| "start": 4536.12, | |
| "text": "like" | |
| }, | |
| { | |
| "start": 4536.84, | |
| "text": "this um this is where you can download" | |
| }, | |
| { | |
| "start": 4539.32, | |
| "text": "these two files and you can inspect them" | |
| }, | |
| { | |
| "start": 4540.8, | |
| "text": "if you'd like and what you will find is" | |
| }, | |
| { | |
| "start": 4542.88, | |
| "text": "that this encoder as they call it in" | |
| }, | |
| { | |
| "start": 4545.08, | |
| "text": "their code is exactly equivalent to our" | |
| }, | |
| { | |
| "start": 4547.639, | |
| "text": "vocab so remember here where we have" | |
| }, | |
| { | |
| "start": 4551.8, | |
| "text": "this vocab object which allowed us to" | |
| }, | |
| { | |
| "start": 4553.48, | |
| "text": "decode very efficiently and basically it" | |
| }, | |
| { | |
| "start": 4556.0, | |
| "text": "took us from the integer to the bytes uh" | |
| }, | |
| { | |
| "start": 4560.12, | |
| "text": "for that integer so our vocab is exactly" | |
| }, | |
| { | |
| "start": 4563.32, | |
| "text": "their encoder and then their vocab bpe" | |
| }, | |
| { | |
| "start": 4567.76, | |
| "text": "confusingly is actually are merges so" | |
| }, | |
| { | |
| "start": 4571.159, | |
| "text": "their BP merges which is based on the" | |
| }, | |
| { | |
| "start": 4574.0, | |
| "text": "data inside vocab bpe ends up being" | |
| }, | |
| { | |
| "start": 4576.679, | |
| "text": "equivalent to our merges so uh basically" | |
| }, | |
| { | |
| "start": 4580.679, | |
| "text": "they are saving and loading the two uh" | |
| }, | |
| { | |
| "start": 4584.36, | |
| "text": "variables that for us are also critical" | |
| }, | |
| { | |
| "start": 4586.239, | |
| "text": "the merges variable and the vocab" | |
| }, | |
| { | |
| "start": 4588.32, | |
| "text": "variable using just these two variables" | |
| }, | |
| { | |
| "start": 4591.12, | |
| "text": "you can represent a tokenizer and you" | |
| }, | |
| { | |
| "start": 4592.56, | |
| "text": "can both do encoding and decoding once" | |
| }, | |
| { | |
| "start": 4594.52, | |
| "text": "you've trained this" | |
| }, | |
| { | |
| "start": 4596.0, | |
| "text": "tokenizer now the only thing that um is" | |
| }, | |
| { | |
| "start": 4600.0, | |
| "text": "actually slightly confusing inside what" | |
| }, | |
| { | |
| "start": 4602.56, | |
| "text": "openai does here is that in addition" | |
| }, | |
| { | |
| "start": 4604.52, | |
| "text": "to this encoder and a decoder they also" | |
| }, | |
| { | |
| "start": 4606.88, | |
| "text": "have something called a byte encoder and" | |
| }, | |
| { | |
| "start": 4608.52, | |
| "text": "a byte decoder and this is actually" | |
| }, | |
| { | |
| "start": 4611.28, | |
| "text": "unfortunately just" | |
| }, | |
| { | |
| "start": 4613.96, | |
| "text": "kind of a spurious implementation detail" | |
| }, | |
| { | |
| "start": 4615.88, | |
| "text": "and isn't actually deep or interesting" | |
| }, | |
| { | |
| "start": 4617.719, | |
| "text": "in any way so I'm going to skip the" | |
| }, | |
| { | |
| "start": 4619.08, | |
| "text": "discussion of it but what openai" | |
| }, | |
| { | |
| "start": 4621.04, | |
| "text": "does here for reasons that I don't fully" | |
| }, | |
| { | |
| "start": 4622.8, | |
| "text": "understand is that not only have they" | |
| }, | |
| { | |
| "start": 4625.0, | |
| "text": "this tokenizer which can encode and" | |
| }, | |
| { | |
| "start": 4626.44, | |
| "text": "decode but they have a whole separate" | |
| }, | |
| { | |
| "start": 4628.159, | |
| "text": "layer here in addition that is used" | |
| }, | |
| { | |
| "start": 4630.0, | |
| "text": "serially with the tokenizer and so you" | |
| }, | |
| { | |
| "start": 4632.639, | |
| "text": "first do um byte encode and then encode" | |
| }, | |
| { | |
| "start": 4636.08, | |
| "text": "and then you do decode and then byte" | |
| }, | |
| { | |
| "start": 4637.679, | |
| "text": "decode so that's the loop and they are" | |
| }, | |
| { | |
| "start": 4640.239, | |
| "text": "just stacked serial on top of each other" | |
| }, | |
| { | |
| "start": 4642.84, | |
| "text": "and and it's not that interesting so I" | |
| }, | |
| { | |
| "start": 4644.719, | |
| "text": "won't cover it and you can step through" | |
| }, | |
| { | |
| "start": 4645.96, | |
| "text": "it if you'd like otherwise this file if" | |
| }, | |
| { | |
| "start": 4648.639, | |
| "text": "you ignore the byte encoder and the byte" | |
| }, | |
| { | |
| "start": 4650.239, | |
| "text": "decoder will be algorithmically very" | |
| }, | |
| { | |
| "start": 4651.88, | |
| "text": "familiar with you and the meat of it" | |
| }, | |
| { | |
| "start": 4653.96, | |
| "text": "here is the what they call bpe function" | |
| }, | |
| { | |
| "start": 4657.04, | |
| "text": "and you should recognize this Loop here" | |
| }, | |
| { | |
| "start": 4659.639, | |
| "text": "which is very similar to our own while loop" | |
| }, | |
| { | |
| "start": 4661.96, | |
| "text": "where they're trying to identify the" | |
| }, | |
| { | |
| "start": 4663.52, | |
| "text": "bigram uh a pair that they should be" | |
| }, | |
| { | |
| "start": 4666.96, | |
| "text": "merging next and then here just like we" | |
| }, | |
| { | |
| "start": 4669.159, | |
| "text": "had they have a for Loop trying to merge" | |
| }, | |
| { | |
| "start": 4670.96, | |
| "text": "this pair uh so they will go over all of" | |
| }, | |
| { | |
| "start": 4673.6, | |
| "text": "the sequence and they will merge the" | |
| }, | |
| { | |
| "start": 4675.12, | |
| "text": "pair whenever they find it and they keep" | |
| }, | |
| { | |
| "start": 4677.84, | |
| "text": "repeating that until they run out of" | |
| }, | |
| { | |
| "start": 4679.8, | |
| "text": "possible merges in the in the text so" | |
| }, | |
| { | |
| "start": 4682.36, | |
| "text": "that's the meat of this file and uh" | |
| }, | |
| { | |
| "start": 4684.56, | |
| "text": "there's an encode and a decode function" | |
| }, | |
| { | |
| "start": 4686.04, | |
| "text": "just like we have implemented it so long" | |
| }, | |
| { | |
| "start": 4688.159, | |
| "text": "story short what I want you to take away" | |
| }, | |
| { | |
| "start": 4689.719, | |
| "text": "at this point is that unfortunately it's" | |
| }, | |
| { | |
| "start": 4691.639, | |
| "text": "a little bit of a messy code that they" | |
| }, | |
| { | |
| "start": 4693.0, | |
| "text": "have but algorithmically it is identical" | |
| }, | |
| { | |
| "start": 4695.12, | |
| "text": "to what we've built up above and what" | |
| }, | |
| { | |
| "start": 4697.719, | |
| "text": "we've built up above if you understand" | |
| }, | |
| { | |
| "start": 4699.159, | |
| "text": "it is algorithmically what is necessary" | |
| }, | |
| { | |
| "start": 4701.32, | |
| "text": "to actually build a BPE tokenizer" | |
| }, | |
| { | |
| "start": 4703.719, | |
| "text": "train it and then both encode and decode" | |
| }, | |
| { | |
| "start": 4706.84, | |
| "text": "the next topic I would like to turn to" | |
| }, | |
| { | |
| "start": 4708.28, | |
| "text": "is that of special tokens so in addition" | |
| }, | |
| { | |
| "start": 4710.92, | |
| "text": "to tokens that are coming from you know" | |
| }, | |
| { | |
| "start": 4712.6, | |
| "text": "raw bytes and the BP merges we can" | |
| }, | |
| { | |
| "start": 4715.239, | |
| "text": "insert all kinds of tokens that we are" | |
| }, | |
| { | |
| "start": 4716.8, | |
| "text": "going to use to delimit different parts" | |
| }, | |
| { | |
| "start": 4718.96, | |
| "text": "of the data or introduced to create a" | |
| }, | |
| { | |
| "start": 4721.04, | |
| "text": "special structure of the token streams" | |
| }, | |
| { | |
| "start": 4724.8, | |
| "text": "so in uh if you look at this encoder" | |
| }, | |
| { | |
| "start": 4727.48, | |
| "text": "object from open AIS gpd2 right here we" | |
| }, | |
| { | |
| "start": 4730.88, | |
| "text": "mentioned this is very similar to our" | |
| }, | |
| { | |
| "start": 4732.159, | |
| "text": "vocab you'll notice that the length of" | |
| }, | |
| { | |
| "start": 4734.84, | |
| "text": "this is" | |
| }, | |
| { | |
| "start": 4738.88, | |
| "text": "50257 and as I mentioned it's mapping uh" | |
| }, | |
| { | |
| "start": 4741.84, | |
| "text": "and it's inverted from the mapping of" | |
| }, | |
| { | |
| "start": 4743.36, | |
| "text": "our vocab our vocab goes from integer to" | |
| }, | |
| { | |
| "start": 4746.12, | |
| "text": "string and they go the other way around" | |
| }, | |
| { | |
| "start": 4748.08, | |
| "text": "for no amazing reason um but the thing" | |
| }, | |
| { | |
| "start": 4751.84, | |
| "text": "to note here is that this the mapping" | |
| }, | |
| { | |
| "start": 4753.28, | |
| "text": "table here is" | |
| }, | |
| { | |
| "start": 4755.0, | |
| "text": "50257 where does that number come from" | |
| }, | |
| { | |
| "start": 4758.6, | |
| "text": "where what are the tokens as I mentioned" | |
| }, | |
| { | |
| "start": 4760.8, | |
| "text": "there are 256 raw byte token" | |
| }, | |
| { | |
| "start": 4764.4, | |
| "text": "tokens and then openai actually did" | |
| }, | |
| { | |
| "start": 4767.199, | |
| "text": "50,000" | |
| }, | |
| { | |
| "start": 4768.639, | |
| "text": "merges so those become the other tokens" | |
| }, | |
| { | |
| "start": 4772.0, | |
| "text": "but this would have been" | |
| }, | |
| { | |
| "start": 4774.04, | |
| "text": "50256 so what is the 57th token and" | |
| }, | |
| { | |
| "start": 4777.679, | |
| "text": "there is basically one special" | |
| }, | |
| { | |
| "start": 4780.52, | |
| "text": "token and that one special token you can" | |
| }, | |
| { | |
| "start": 4783.239, | |
| "text": "see is called end of text so this is a" | |
| }, | |
| { | |
| "start": 4787.04, | |
| "text": "special token and it's the very last" | |
| }, | |
| { | |
| "start": 4789.56, | |
| "text": "token and this token is used to delimit" | |
| }, | |
| { | |
| "start": 4792.48, | |
| "text": "documents in the training set so" | |
| }, | |
| { | |
| "start": 4795.76, | |
| "text": "when we're creating the training data we" | |
| }, | |
| { | |
| "start": 4797.32, | |
| "text": "have all these documents and we tokenize" | |
| }, | |
| { | |
| "start": 4799.199, | |
| "text": "them and we get a stream of tokens those" | |
| }, | |
| { | |
| "start": 4801.8, | |
| "text": "tokens only range from Z to" | |
| }, | |
| { | |
| "start": 4805.28, | |
| "text": "50256 and then in between those" | |
| }, | |
| { | |
| "start": 4807.4, | |
| "text": "documents we put special end of text" | |
| }, | |
| { | |
| "start": 4810.4, | |
| "text": "token and we insert that token in" | |
| }, | |
| { | |
| "start": 4812.8, | |
| "text": "between documents and we are using this" | |
| }, | |
| { | |
| "start": 4815.639, | |
| "text": "as a signal to the language model that" | |
| }, | |
| { | |
| "start": 4818.4, | |
| "text": "the document has ended and what follows" | |
| }, | |
| { | |
| "start": 4820.719, | |
| "text": "is going to be unrelated to the document" | |
| }, | |
| { | |
| "start": 4823.28, | |
| "text": "previously that said the language model" | |
| }, | |
| { | |
| "start": 4825.199, | |
| "text": "has to learn this from data it it needs" | |
| }, | |
| { | |
| "start": 4827.199, | |
| "text": "to learn that this token usually means" | |
| }, | |
| { | |
| "start": 4829.719, | |
| "text": "that it should wipe its sort of memory" | |
| }, | |
| { | |
| "start": 4831.92, | |
| "text": "of what came before and what came before" | |
| }, | |
| { | |
| "start": 4834.04, | |
| "text": "this token is not actually informative" | |
| }, | |
| { | |
| "start": 4835.56, | |
| "text": "to what comes next but we are expecting" | |
| }, | |
| { | |
| "start": 4837.56, | |
| "text": "the language model to just like learn" | |
| }, | |
| { | |
| "start": 4839.0, | |
| "text": "this but we're giving it the Special" | |
| }, | |
| { | |
| "start": 4840.92, | |
| "text": "sort of the limiter of these documents" | |
| }, | |
| { | |
| "start": 4844.08, | |
| "text": "we can go here to Tik tokenizer and um" | |
| }, | |
| { | |
| "start": 4846.679, | |
| "text": "this the gpt2 tokenizer uh our code that" | |
| }, | |
| { | |
| "start": 4849.48, | |
| "text": "we've been playing with before so we can" | |
| }, | |
| { | |
| "start": 4851.44, | |
| "text": "add here right hello world world how are" | |
| }, | |
| { | |
| "start": 4853.679, | |
| "text": "you and we're getting different tokens" | |
| }, | |
| { | |
| "start": 4855.84, | |
| "text": "but now you can see what if what happens" | |
| }, | |
| { | |
| "start": 4858.239, | |
| "text": "if I put end of text you see how until I" | |
| }, | |
| { | |
| "start": 4862.199, | |
| "text": "finished it these are all different" | |
| }, | |
| { | |
| "start": 4863.92, | |
| "text": "tokens end of" | |
| }, | |
| { | |
| "start": 4866.36, | |
| "text": "text still set different tokens and now" | |
| }, | |
| { | |
| "start": 4868.8, | |
| "text": "when I finish it suddenly we get token" | |
| }, | |
| { | |
| "start": 4873.28, | |
| "text": "50256 and the reason this works is" | |
| }, | |
| { | |
| "start": 4875.88, | |
| "text": "because this didn't actually go through" | |
| }, | |
| { | |
| "start": 4878.239, | |
| "text": "the bpe merges instead the code that" | |
| }, | |
| { | |
| "start": 4881.92, | |
| "text": "actually outputted tokens has special" | |
| }, | |
| { | |
| "start": 4885.0, | |
| "text": "case instructions for handling special" | |
| }, | |
| { | |
| "start": 4888.04, | |
| "text": "tokens um we did not see these special" | |
| }, | |
| { | |
| "start": 4890.76, | |
| "text": "instructions for handling special tokens" | |
| }, | |
| { | |
| "start": 4892.84, | |
| "text": "in the encoder.py it's absent there" | |
| }, | |
| { | |
| "start": 4896.36, | |
| "text": "but if you go to Tik token Library" | |
| }, | |
| { | |
| "start": 4898.0, | |
| "text": "which is uh implemented in Rust you will" | |
| }, | |
| { | |
| "start": 4900.92, | |
| "text": "find all kinds of special case handling" | |
| }, | |
| { | |
| "start": 4902.639, | |
| "text": "for these special tokens that you can" | |
| }, | |
| { | |
| "start": 4904.52, | |
| "text": "register uh create adds to the" | |
| }, | |
| { | |
| "start": 4907.12, | |
| "text": "vocabulary and then it looks for them" | |
| }, | |
| { | |
| "start": 4909.0, | |
| "text": "and it uh whenever it sees these special" | |
| }, | |
| { | |
| "start": 4910.92, | |
| "text": "tokens like this it will actually come" | |
| }, | |
| { | |
| "start": 4913.44, | |
| "text": "in and swap in that special token so" | |
| }, | |
| { | |
| "start": 4916.08, | |
| "text": "these things are outside of the typical" | |
| }, | |
| { | |
| "start": 4918.12, | |
| "text": "algorithm of uh B pair en" | |
| }, | |
| { | |
| "start": 4920.56, | |
| "text": "coding so these special tokens are used" | |
| }, | |
| { | |
| "start": 4922.92, | |
| "text": "pervasively uh not just in uh basically" | |
| }, | |
| { | |
| "start": 4925.639, | |
| "text": "base language modeling of predicting the" | |
| }, | |
| { | |
| "start": 4927.4, | |
| "text": "next token in the sequence but" | |
| }, | |
| { | |
| "start": 4929.08, | |
| "text": "especially when it gets to later to the" | |
| }, | |
| { | |
| "start": 4930.679, | |
| "text": "fine tuning stage and all of the chat uh" | |
| }, | |
| { | |
| "start": 4933.239, | |
| "text": "gpt sort of aspects of it uh because we" | |
| }, | |
| { | |
| "start": 4935.679, | |
| "text": "don't just want to Del limit documents" | |
| }, | |
| { | |
| "start": 4936.88, | |
| "text": "we want to delimit entire conversations" | |
| }, | |
| { | |
| "start": 4938.719, | |
| "text": "between an assistant and a user so if I" | |
| }, | |
| { | |
| "start": 4941.56, | |
| "text": "refresh this Tik tokenizer page the" | |
| }, | |
| { | |
| "start": 4944.239, | |
| "text": "default example that they have here is" | |
| }, | |
| { | |
| "start": 4946.44, | |
| "text": "using not sort of base model encoders" | |
| }, | |
| { | |
| "start": 4950.12, | |
| "text": "but ftuned model uh sort of tokenizers" | |
| }, | |
| { | |
| "start": 4953.6, | |
| "text": "um so for example using the GPT 3.5" | |
| }, | |
| { | |
| "start": 4955.84, | |
| "text": "turbo scheme these here are all special" | |
| }, | |
| { | |
| "start": 4958.96, | |
| "text": "tokens im start im end Etc uh this is" | |
| }, | |
| { | |
| "start": 4963.239, | |
| "text": "short for Imaginary monologue start by the" | |
| }, | |
| { | |
| "start": 4966.84, | |
| "text": "way but you can see here that there's a" | |
| }, | |
| { | |
| "start": 4969.6, | |
| "text": "sort of start and end of every single" | |
| }, | |
| { | |
| "start": 4971.199, | |
| "text": "message and there can be many other" | |
| }, | |
| { | |
| "start": 4972.56, | |
| "text": "other tokens lots of tokens um in use to" | |
| }, | |
| { | |
| "start": 4976.52, | |
| "text": "delimit these conversations and kind of" | |
| }, | |
| { | |
| "start": 4978.719, | |
| "text": "keep track of the flow of the messages" | |
| }, | |
| { | |
| "start": 4980.84, | |
| "text": "here now we can go back to the Tik token" | |
| }, | |
| { | |
| "start": 4983.8, | |
| "text": "library and here when you scroll to the" | |
| }, | |
| { | |
| "start": 4986.239, | |
| "text": "bottom they talk about how you can" | |
| }, | |
| { | |
| "start": 4988.159, | |
| "text": "extend tick token and I can you can" | |
| }, | |
| { | |
| "start": 4990.239, | |
| "text": "create basically you can Fork uh the um" | |
| }, | |
| { | |
| "start": 4993.679, | |
| "text": "CL 100K base tokenizers in gp4 and for" | |
| }, | |
| { | |
| "start": 4997.32, | |
| "text": "example you can extend it by adding more" | |
| }, | |
| { | |
| "start": 4998.92, | |
| "text": "special tokens and these are totally up" | |
| }, | |
| { | |
| "start": 5000.36, | |
| "text": "to you you can come up with any" | |
| }, | |
| { | |
| "start": 5001.36, | |
| "text": "arbitrary tokens and add them with the" | |
| }, | |
| { | |
| "start": 5003.76, | |
| "text": "new ID afterwards and the Tik token library" | |
| }, | |
| { | |
| "start": 5006.52, | |
| "text": "will uh correctly swap them out uh when" | |
| }, | |
| { | |
| "start": 5009.88, | |
| "text": "it sees this in the" | |
| }, | |
| { | |
| "start": 5011.76, | |
| "text": "strings now we can also go back to this" | |
| }, | |
| { | |
| "start": 5014.96, | |
| "text": "file which we've looked at previously" | |
| }, | |
| { | |
| "start": 5017.08, | |
| "text": "and I mentioned that the gpt2 in Tik" | |
| }, | |
| { | |
| "start": 5019.679, | |
| "text": "token open" | |
| }, | |
| { | |
| "start": 5021.44, | |
| "text": "I.P we have the vocabulary we have the" | |
| }, | |
| { | |
| "start": 5024.0, | |
| "text": "pattern for splitting and then here we" | |
| }, | |
| { | |
| "start": 5026.28, | |
| "text": "are registering the single special token" | |
| }, | |
| { | |
| "start": 5028.04, | |
| "text": "in gpd2 which was the end of text token" | |
| }, | |
| { | |
| "start": 5030.32, | |
| "text": "and we saw that it has this ID" | |
| }, | |
| { | |
| "start": 5033.0, | |
| "text": "in GPT 4 when they define this here you" | |
| }, | |
| { | |
| "start": 5036.4, | |
| "text": "see that the pattern has changed as" | |
| }, | |
| { | |
| "start": 5037.6, | |
| "text": "we've discussed but also the special" | |
| }, | |
| { | |
| "start": 5039.36, | |
| "text": "tokens have changed in this tokenizer so" | |
| }, | |
| { | |
| "start": 5041.8, | |
| "text": "we of course have the end of text just" | |
| }, | |
| { | |
| "start": 5043.719, | |
| "text": "like in gpd2 but we also see three sorry" | |
| }, | |
| { | |
| "start": 5046.88, | |
| "text": "four additional tokens here the fim prefix" | |
| }, | |
| { | |
| "start": 5049.52, | |
| "text": "middle and suffix what is fim fim is" | |
| }, | |
| { | |
| "start": 5052.36, | |
| "text": "short for fill in the middle and if" | |
| }, | |
| { | |
| "start": 5054.88, | |
| "text": "you'd like to learn more about this idea" | |
| }, | |
| { | |
| "start": 5057.0, | |
| "text": "it comes from this paper um and I'm not" | |
| }, | |
| { | |
| "start": 5060.0, | |
| "text": "going to go into detail in this video" | |
| }, | |
| { | |
| "start": 5061.199, | |
| "text": "it's beyond this video and then there's" | |
| }, | |
| { | |
| "start": 5063.44, | |
| "text": "one additional uh serve token here so" | |
| }, | |
| { | |
| "start": 5067.04, | |
| "text": "that's that encoding as well so it's" | |
| }, | |
| { | |
| "start": 5069.92, | |
| "text": "very common basically to train a" | |
| }, | |
| { | |
| "start": 5071.6, | |
| "text": "language model and then if you'd like uh" | |
| }, | |
| { | |
| "start": 5074.719, | |
| "text": "you can add special tokens now when you" | |
| }, | |
| { | |
| "start": 5077.52, | |
| "text": "add special tokens you of course have to" | |
| }, | |
| { | |
| "start": 5079.8, | |
| "text": "um do some model surgery to the" | |
| }, | |
| { | |
| "start": 5081.719, | |
| "text": "Transformer and all the parameters" | |
| }, | |
| { | |
| "start": 5083.44, | |
| "text": "involved in that Transformer because you" | |
| }, | |
| { | |
| "start": 5085.159, | |
| "text": "are basically adding an integer and you" | |
| }, | |
| { | |
| "start": 5087.119, | |
| "text": "want to make sure that for example your" | |
| }, | |
| { | |
| "start": 5088.56, | |
| "text": "embedding Matrix for the vocabulary" | |
| }, | |
| { | |
| "start": 5090.639, | |
| "text": "tokens has to be extended by adding a" | |
| }, | |
| { | |
| "start": 5093.04, | |
| "text": "row and typically this row would be" | |
| }, | |
| { | |
| "start": 5094.88, | |
| "text": "initialized uh with small random numbers" | |
| }, | |
| { | |
| "start": 5096.88, | |
| "text": "or something like that because we need" | |
| }, | |
| { | |
| "start": 5098.8, | |
| "text": "to have a vector that now stands for" | |
| }, | |
| { | |
| "start": 5101.199, | |
| "text": "that token in addition to that you have" | |
| }, | |
| { | |
| "start": 5103.28, | |
| "text": "to go to the final layer of the" | |
| }, | |
| { | |
| "start": 5104.28, | |
| "text": "Transformer and you have to make sure" | |
| }, | |
| { | |
| "start": 5105.679, | |
| "text": "that that projection at the very end" | |
| }, | |
| { | |
| "start": 5107.52, | |
| "text": "into the classifier uh is extended by" | |
| }, | |
| { | |
| "start": 5109.679, | |
| "text": "one as well so basically there's some" | |
| }, | |
| { | |
| "start": 5111.8, | |
| "text": "model surgery involved that you have to" | |
| }, | |
| { | |
| "start": 5113.48, | |
| "text": "couple with the tokenization changes if" | |
| }, | |
| { | |
| "start": 5116.52, | |
| "text": "you are going to add special tokens but" | |
| }, | |
| { | |
| "start": 5118.92, | |
| "text": "this is a very common operation that" | |
| }, | |
| { | |
| "start": 5120.199, | |
| "text": "people do especially if they'd like to" | |
| }, | |
| { | |
| "start": 5121.8, | |
| "text": "fine tune the model for example taking" | |
| }, | |
| { | |
| "start": 5123.719, | |
| "text": "it from a base model to a chat model" | |
| }, | |
| { | |
| "start": 5126.239, | |
| "text": "like chat" | |
| }, | |
| { | |
| "start": 5127.88, | |
| "text": "GPT okay so at this point you should" | |
| }, | |
| { | |
| "start": 5129.84, | |
| "text": "have everything you need in order to" | |
| }, | |
| { | |
| "start": 5131.04, | |
| "text": "build your own gp4 tokenizer now in the" | |
| }, | |
| { | |
| "start": 5133.719, | |
| "text": "process of developing this lecture I've" | |
| }, | |
| { | |
| "start": 5135.36, | |
| "text": "done that and I published the code under" | |
| }, | |
| { | |
| "start": 5137.239, | |
| "text": "this repository" | |
| }, | |
| { | |
| "start": 5138.92, | |
| "text": "MBP so MBP looks like this right now as" | |
| }, | |
| { | |
| "start": 5142.52, | |
| "text": "I'm recording but uh the MBP repository" | |
| }, | |
| { | |
| "start": 5145.36, | |
| "text": "will probably change quite a bit because" | |
| }, | |
| { | |
| "start": 5146.719, | |
| "text": "I intend to continue working on it um in" | |
| }, | |
| { | |
| "start": 5149.84, | |
| "text": "addition to the MBP repository I've" | |
| }, | |
| { | |
| "start": 5151.76, | |
| "text": "published the this uh exercise" | |
| }, | |
| { | |
| "start": 5153.44, | |
| "text": "progression that you can follow so if" | |
| }, | |
| { | |
| "start": 5155.36, | |
| "text": "you go to exercise. MD here uh this is" | |
| }, | |
| { | |
| "start": 5158.36, | |
| "text": "sort of me breaking up the task ahead of" | |
| }, | |
| { | |
| "start": 5161.159, | |
| "text": "you into four steps that sort of uh" | |
| }, | |
| { | |
| "start": 5163.4, | |
| "text": "build up to what can be a gp4 tokenizer" | |
| }, | |
| { | |
| "start": 5166.639, | |
| "text": "and so feel free to follow these steps" | |
| }, | |
| { | |
| "start": 5168.4, | |
| "text": "exactly and follow a little bit of the" | |
| }, | |
| { | |
| "start": 5170.4, | |
| "text": "guidance that I've laid out here and" | |
| }, | |
| { | |
| "start": 5172.48, | |
| "text": "anytime you feel stuck just reference" | |
| }, | |
| { | |
| "start": 5174.639, | |
| "text": "the MBP repository here so either the" | |
| }, | |
| { | |
| "start": 5177.96, | |
| "text": "tests could be useful or the MBP" | |
| }, | |
| { | |
| "start": 5180.08, | |
| "text": "repository itself I try to keep the code" | |
| }, | |
| { | |
| "start": 5182.6, | |
| "text": "fairly clean and understandable and so" | |
| }, | |
| { | |
| "start": 5186.159, | |
| "text": "um feel free to reference it whenever um" | |
| }, | |
| { | |
| "start": 5188.92, | |
| "text": "you get" | |
| }, | |
| { | |
| "start": 5190.159, | |
| "text": "stuck uh in addition to that basically" | |
| }, | |
| { | |
| "start": 5192.56, | |
| "text": "once you write it you should be able to" | |
| }, | |
| { | |
| "start": 5194.679, | |
| "text": "reproduce this behavior from Tik token" | |
| }, | |
| { | |
| "start": 5196.84, | |
| "text": "so getting the gb4 tokenizer you can" | |
| }, | |
| { | |
| "start": 5199.32, | |
| "text": "take uh you can encode the string and" | |
| }, | |
| { | |
| "start": 5201.32, | |
| "text": "you should get these tokens and then you" | |
| }, | |
| { | |
| "start": 5203.239, | |
| "text": "can encode and decode the exact same" | |
| }, | |
| { | |
| "start": 5204.679, | |
| "text": "string to recover it and in addition to" | |
| }, | |
| { | |
| "start": 5207.239, | |
| "text": "all that you should be able to implement" | |
| }, | |
| { | |
| "start": 5208.4, | |
| "text": "your own train function uh which Tik" | |
| }, | |
| { | |
| "start": 5210.719, | |
| "text": "token Library does not provide it's it's" | |
| }, | |
| { | |
| "start": 5212.48, | |
| "text": "again only inference code but you could" | |
| }, | |
| { | |
| "start": 5214.6, | |
| "text": "write your own train MBP does it as well" | |
| }, | |
| { | |
| "start": 5217.88, | |
| "text": "and that will allow you to train your" | |
| }, | |
| { | |
| "start": 5219.32, | |
| "text": "own token" | |
| }, | |
| { | |
| "start": 5220.719, | |
| "text": "vocabularies so here are some of the" | |
| }, | |
| { | |
| "start": 5222.4, | |
| "text": "code inside M be mean bpe uh shows the" | |
| }, | |
| { | |
| "start": 5226.04, | |
| "text": "token vocabularies that you might obtain" | |
| }, | |
| { | |
| "start": 5228.719, | |
| "text": "so on the left uh here we have the GPT 4" | |
| }, | |
| { | |
| "start": 5232.4, | |
| "text": "merges uh so the first 256 are raw" | |
| }, | |
| { | |
| "start": 5235.84, | |
| "text": "individual bytes and then here I am" | |
| }, | |
| { | |
| "start": 5237.719, | |
| "text": "visualizing the merges that gp4" | |
| }, | |
| { | |
| "start": 5239.56, | |
| "text": "performed during its training so the" | |
| }, | |
| { | |
| "start": 5241.76, | |
| "text": "very first merge that gp4 did was merge" | |
| }, | |
| { | |
| "start": 5244.92, | |
| "text": "two spaces into a single token for you" | |
| }, | |
| { | |
| "start": 5247.6, | |
| "text": "know two spaces and that is a token 256" | |
| }, | |
| { | |
| "start": 5250.84, | |
| "text": "and so this is the order in which things" | |
| }, | |
| { | |
| "start": 5252.239, | |
| "text": "merged during gb4 training and this is" | |
| }, | |
| { | |
| "start": 5254.679, | |
| "text": "the merge order that um we obtain in MBP" | |
| }, | |
| { | |
| "start": 5259.08, | |
| "text": "by training a tokenizer and in this case" | |
| }, | |
| { | |
| "start": 5261.199, | |
| "text": "I trained it on a Wikipedia page of" | |
| }, | |
| { | |
| "start": 5263.239, | |
| "text": "Taylor Swift uh not because I'm a Swifty" | |
| }, | |
| { | |
| "start": 5265.6, | |
| "text": "but because that is one of the longest" | |
| }, | |
| { | |
| "start": 5267.8, | |
| "text": "um Wikipedia Pages apparently that's" | |
| }, | |
| { | |
| "start": 5269.639, | |
| "text": "available but she is pretty cool and" | |
| }, | |
| { | |
| "start": 5274.04, | |
| "text": "um what was I going to say yeah so you" | |
| }, | |
| { | |
| "start": 5276.639, | |
| "text": "can compare these two uh vocabularies" | |
| }, | |
| { | |
| "start": 5279.08, | |
| "text": "and so as an example um here GPT for" | |
| }, | |
| { | |
| "start": 5284.0, | |
| "text": "merged I in to become in and we've done" | |
| }, | |
| { | |
| "start": 5286.8, | |
| "text": "the exact same thing on this token 259" | |
| }, | |
| { | |
| "start": 5290.0, | |
| "text": "here space t becomes space t and that" | |
| }, | |
| { | |
| "start": 5293.28, | |
| "text": "happened for us a little bit later as" | |
| }, | |
| { | |
| "start": 5294.639, | |
| "text": "well so the difference here is again to" | |
| }, | |
| { | |
| "start": 5296.719, | |
| "text": "my understanding only a difference of" | |
| }, | |
| { | |
| "start": 5298.4, | |
| "text": "the training set so as an example" | |
| }, | |
| { | |
| "start": 5300.28, | |
| "text": "because I see a lot of white space I" | |
| }, | |
| { | |
| "start": 5302.08, | |
| "text": "suspect that gp4 probably had a lot of" | |
| }, | |
| { | |
| "start": 5303.76, | |
| "text": "python code in its training set I'm not" | |
| }, | |
| { | |
| "start": 5305.48, | |
| "text": "sure uh for the" | |
| }, | |
| { | |
| "start": 5307.6, | |
| "text": "tokenizer and uh here we see much less" | |
| }, | |
| { | |
| "start": 5310.08, | |
| "text": "of that of course in the Wikipedia page" | |
| }, | |
| { | |
| "start": 5312.96, | |
| "text": "so roughly speaking they look the same" | |
| }, | |
| { | |
| "start": 5314.679, | |
| "text": "and they look the same because they're" | |
| }, | |
| { | |
| "start": 5315.96, | |
| "text": "running the same algorithm and when you" | |
| }, | |
| { | |
| "start": 5318.08, | |
| "text": "train your own you're probably going to" | |
| }, | |
| { | |
| "start": 5319.199, | |
| "text": "get something similar depending on what" | |
| }, | |
| { | |
| "start": 5321.199, | |
| "text": "you train it on okay so we are now going" | |
| }, | |
| { | |
| "start": 5323.28, | |
| "text": "to move on from tick token and the way" | |
| }, | |
| { | |
| "start": 5325.08, | |
| "text": "that open AI tokenizes its strings and" | |
| }, | |
| { | |
| "start": 5327.6, | |
| "text": "we're going to discuss one more very" | |
| }, | |
| { | |
| "start": 5329.199, | |
| "text": "commonly used library for working with" | |
| }, | |
| { | |
| "start": 5331.0, | |
| "text": "tokenization in llms" | |
| }, | |
| { | |
| "start": 5332.719, | |
| "text": "and that is sentence piece so sentence" | |
| }, | |
| { | |
| "start": 5335.36, | |
| "text": "piece is very commonly used in language" | |
| }, | |
| { | |
| "start": 5338.159, | |
| "text": "models because unlike Tik token it can" | |
| }, | |
| { | |
| "start": 5340.119, | |
| "text": "do both training and inference and is" | |
| }, | |
| { | |
| "start": 5342.36, | |
| "text": "quite efficient at both it supports a" | |
| }, | |
| { | |
| "start": 5344.84, | |
| "text": "number of algorithms for training uh" | |
| }, | |
| { | |
| "start": 5346.76, | |
| "text": "vocabularies but one of them is the B" | |
| }, | |
| { | |
| "start": 5349.199, | |
| "text": "pair en coding algorithm that we've been" | |
| }, | |
| { | |
| "start": 5350.44, | |
| "text": "looking at so it supports it now" | |
| }, | |
| { | |
| "start": 5353.639, | |
| "text": "sentence piece is used both by llama and" | |
| }, | |
| { | |
| "start": 5355.719, | |
| "text": "mistral series and many other models as" | |
| }, | |
| { | |
| "start": 5358.199, | |
| "text": "well it is on GitHub under Google" | |
| }, | |
| { | |
| "start": 5360.76, | |
| "text": "sentence piece" | |
| }, | |
| { | |
| "start": 5362.76, | |
| "text": "and the big difference with sentence" | |
| }, | |
| { | |
| "start": 5364.4, | |
| "text": "piece and we're going to look at example" | |
| }, | |
| { | |
| "start": 5366.199, | |
| "text": "because this is kind of hard and subtle" | |
| }, | |
| { | |
| "start": 5367.92, | |
| "text": "to explain is that they think different" | |
| }, | |
| { | |
| "start": 5371.04, | |
| "text": "about the order of operations here so in" | |
| }, | |
| { | |
| "start": 5375.48, | |
| "text": "the case of Tik token we first take our" | |
| }, | |
| { | |
| "start": 5378.56, | |
| "text": "code points in the string we encode them" | |
| }, | |
| { | |
| "start": 5381.0, | |
| "text": "using utf8 to bytes and then we're" | |
| }, | |
| { | |
| "start": 5382.88, | |
| "text": "merging bytes it's fairly" | |
| }, | |
| { | |
| "start": 5384.96, | |
| "text": "straightforward for sentence piece um it" | |
| }, | |
| { | |
| "start": 5388.88, | |
| "text": "works directly on the level of the code" | |
| }, | |
| { | |
| "start": 5390.4, | |
| "text": "points themselves so so it looks at" | |
| }, | |
| { | |
| "start": 5392.52, | |
| "text": "whatever code points are available in" | |
| }, | |
| { | |
| "start": 5393.92, | |
| "text": "your training set and then it starts" | |
| }, | |
| { | |
| "start": 5395.88, | |
| "text": "merging those code points and um the bpe" | |
| }, | |
| { | |
| "start": 5399.76, | |
| "text": "is running on the level of code" | |
| }, | |
| { | |
| "start": 5401.6, | |
| "text": "points and if you happen to run out of" | |
| }, | |
| { | |
| "start": 5404.239, | |
| "text": "code points so there are maybe some rare" | |
| }, | |
| { | |
| "start": 5406.76, | |
| "text": "uh code points that just don't come up" | |
| }, | |
| { | |
| "start": 5408.04, | |
| "text": "too often and the Rarity is determined" | |
| }, | |
| { | |
| "start": 5409.719, | |
| "text": "by this character coverage hyper" | |
| }, | |
| { | |
| "start": 5411.199, | |
| "text": "parameter then these uh code points will" | |
| }, | |
| { | |
| "start": 5414.36, | |
| "text": "either get mapped to a special unknown" | |
| }, | |
| { | |
| "start": 5416.28, | |
| "text": "token like unk or if you have the byte" | |
| }, | |
| { | |
| "start": 5419.52, | |
| "text": "fallback option turned on then that will" | |
| }, | |
| { | |
| "start": 5422.119, | |
| "text": "take those rare code points it will" | |
| }, | |
| { | |
| "start": 5423.96, | |
| "text": "encode them using utf8 and then the" | |
| }, | |
| { | |
| "start": 5426.08, | |
| "text": "individual bytes of that encoding will" | |
| }, | |
| { | |
| "start": 5427.76, | |
| "text": "be translated into tokens and there are" | |
| }, | |
| { | |
| "start": 5430.119, | |
| "text": "these special bite tokens that basically" | |
| }, | |
| { | |
| "start": 5432.199, | |
| "text": "get added to the vocabulary so it uses" | |
| }, | |
| { | |
| "start": 5435.52, | |
| "text": "BP on on the code points and then it" | |
| }, | |
| { | |
| "start": 5438.239, | |
| "text": "falls back to bytes for rare code points" | |
| }, | |
| { | |
| "start": 5441.8, | |
| "text": "um and so that's kind of like difference" | |
| }, | |
| { | |
| "start": 5444.08, | |
| "text": "personally I find the Tik token way" | |
| }, | |
| { | |
| "start": 5445.52, | |
| "text": "significantly cleaner uh but it's kind" | |
| }, | |
| { | |
| "start": 5447.48, | |
| "text": "of like a subtle but pretty major" | |
| }, | |
| { | |
| "start": 5448.84, | |
| "text": "difference between the way they approach" | |
| }, | |
| { | |
| "start": 5450.32, | |
| "text": "tokenization let's work with with a" | |
| }, | |
| { | |
| "start": 5452.04, | |
| "text": "concrete example because otherwise this" | |
| }, | |
| { | |
| "start": 5454.0, | |
| "text": "is kind of hard to um to get your head" | |
| }, | |
| { | |
| "start": 5456.719, | |
| "text": "around so let's work with a concrete" | |
| }, | |
| { | |
| "start": 5459.119, | |
| "text": "example this is how we can import" | |
| }, | |
| { | |
| "start": 5461.119, | |
| "text": "sentence piece and then here we're going" | |
| }, | |
| { | |
| "start": 5463.6, | |
| "text": "to take I think I took like the" | |
| }, | |
| { | |
| "start": 5465.199, | |
| "text": "description of sentence piece and I just" | |
| }, | |
| { | |
| "start": 5466.76, | |
| "text": "created like a little toy data set it" | |
| }, | |
| { | |
| "start": 5468.679, | |
| "text": "really likes to have a file so I created" | |
| }, | |
| { | |
| "start": 5470.4, | |
| "text": "a toy. txt file with this" | |
| }, | |
| { | |
| "start": 5473.08, | |
| "text": "content now what's kind of a little bit" | |
| }, | |
| { | |
| "start": 5475.52, | |
| "text": "crazy about sentence piece is that" | |
| }, | |
| { | |
| "start": 5476.76, | |
| "text": "there's a ton of options and" | |
| }, | |
| { | |
| "start": 5478.679, | |
| "text": "configurations and the reason this is so" | |
| }, | |
| { | |
| "start": 5480.8, | |
| "text": "is because sentence piece has been" | |
| }, | |
| { | |
| "start": 5482.199, | |
| "text": "around I think for a while and it really" | |
| }, | |
| { | |
| "start": 5483.84, | |
| "text": "tries to handle a large diversity of" | |
| }, | |
| { | |
| "start": 5485.76, | |
| "text": "things and um because it's been around I" | |
| }, | |
| { | |
| "start": 5488.44, | |
| "text": "think it has quite a bit of accumulated" | |
| }, | |
| { | |
| "start": 5490.52, | |
| "text": "historical baggage uh as well and so in" | |
| }, | |
| { | |
| "start": 5493.679, | |
| "text": "particular there's like a ton of" | |
| }, | |
| { | |
| "start": 5495.56, | |
| "text": "configuration arguments this is not even" | |
| }, | |
| { | |
| "start": 5496.96, | |
| "text": "all of it you can go to here to see all" | |
| }, | |
| { | |
| "start": 5499.8, | |
| "text": "the training" | |
| }, | |
| { | |
| "start": 5500.96, | |
| "text": "options um and uh there's also quite" | |
| }, | |
| { | |
| "start": 5504.4, | |
| "text": "useful documentation when you look at" | |
| }, | |
| { | |
| "start": 5505.719, | |
| "text": "the raw Proto buff uh that is used to" | |
| }, | |
| { | |
| "start": 5508.6, | |
| "text": "represent the trainer spec and so on um" | |
| }, | |
| { | |
| "start": 5512.44, | |
| "text": "many of these options are irrelevant to" | |
| }, | |
| { | |
| "start": 5514.52, | |
| "text": "us so maybe to point out one example Das" | |
| }, | |
| { | |
| "start": 5516.96, | |
| "text": "Das shrinking Factor uh this shrinking" | |
| }, | |
| { | |
| "start": 5519.84, | |
| "text": "factor is not used in the B pair en" | |
| }, | |
| { | |
| "start": 5521.28, | |
| "text": "coding algorithm so this is just an" | |
| }, | |
| { | |
| "start": 5523.159, | |
| "text": "argument that is irrelevant to us um it" | |
| }, | |
| { | |
| "start": 5525.92, | |
| "text": "applies to a different training" | |
| }, | |
| { | |
| "start": 5529.52, | |
| "text": "algorithm now what I tried to do here is" | |
| }, | |
| { | |
| "start": 5531.92, | |
| "text": "I tried to set up sentence piece in a" | |
| }, | |
| { | |
| "start": 5533.88, | |
| "text": "way that is very very similar as far as" | |
| }, | |
| { | |
| "start": 5535.719, | |
| "text": "I can tell to maybe identical hopefully" | |
| }, | |
| { | |
| "start": 5538.88, | |
| "text": "to the way that llama 2 was trained so" | |
| }, | |
| { | |
| "start": 5542.08, | |
| "text": "the way they trained their own um their" | |
| }, | |
| { | |
| "start": 5545.04, | |
| "text": "own tokenizer and the way I did this was" | |
| }, | |
| { | |
| "start": 5547.119, | |
| "text": "basically you can take the tokenizer" | |
| }, | |
| { | |
| "start": 5548.719, | |
| "text": "model file that meta released and you" | |
| }, | |
| { | |
| "start": 5551.4, | |
| "text": "can um open it using the Proto protuff" | |
| }, | |
| { | |
| "start": 5555.199, | |
| "text": "uh sort of file that you can generate" | |
| }, | |
| { | |
| "start": 5558.36, | |
| "text": "and then you can inspect all the options" | |
| }, | |
| { | |
| "start": 5559.719, | |
| "text": "and I tried to copy over all the options" | |
| }, | |
| { | |
| "start": 5561.36, | |
| "text": "that looked relevant so here we set up" | |
| }, | |
| { | |
| "start": 5563.679, | |
| "text": "the input it's raw text in this file" | |
| }, | |
| { | |
| "start": 5566.6, | |
| "text": "here's going to be the output so it's" | |
| }, | |
| { | |
| "start": 5568.08, | |
| "text": "going to be for talk 400. model and" | |
| }, | |
| { | |
| "start": 5570.76, | |
| "text": "vocab" | |
| }, | |
| { | |
| "start": 5572.44, | |
| "text": "we're saying that we're going to use the" | |
| }, | |
| { | |
| "start": 5573.4, | |
| "text": "BP algorithm and we want a vocab size of" | |
| }, | |
| { | |
| "start": 5576.04, | |
| "text": "400 then there's a ton of configurations" | |
| }, | |
| { | |
| "start": 5578.6, | |
| "text": "here" | |
| }, | |
| { | |
| "start": 5581.08, | |
| "text": "for um for basically pre-processing and" | |
| }, | |
| { | |
| "start": 5585.08, | |
| "text": "normalization rules as they're called" | |
| }, | |
| { | |
| "start": 5587.08, | |
| "text": "normalization used to be very prevalent" | |
| }, | |
| { | |
| "start": 5589.48, | |
| "text": "I would say before llms in natural" | |
| }, | |
| { | |
| "start": 5591.159, | |
| "text": "language processing so in machine" | |
| }, | |
| { | |
| "start": 5592.8, | |
| "text": "translation and uh text classification" | |
| }, | |
| { | |
| "start": 5594.88, | |
| "text": "and so on you want to normalize and" | |
| }, | |
| { | |
| "start": 5596.719, | |
| "text": "simplify the text and you want to turn" | |
| }, | |
| { | |
| "start": 5598.0, | |
| "text": "it all lowercase and you want to remove" | |
| }, | |
| { | |
| "start": 5599.52, | |
| "text": "all double whites space Etc" | |
| }, | |
| { | |
| "start": 5602.199, | |
| "text": "and in language models we prefer not to" | |
| }, | |
| { | |
| "start": 5603.76, | |
| "text": "do any of it or at least that is my" | |
| }, | |
| { | |
| "start": 5605.28, | |
| "text": "preference as a deep learning person you" | |
| }, | |
| { | |
| "start": 5606.96, | |
| "text": "want to not touch your data you want to" | |
| }, | |
| { | |
| "start": 5608.84, | |
| "text": "keep the raw data as much as possible um" | |
| }, | |
| { | |
| "start": 5611.679, | |
| "text": "in a raw" | |
| }, | |
| { | |
| "start": 5613.119, | |
| "text": "form so you're basically trying to turn" | |
| }, | |
| { | |
| "start": 5615.159, | |
| "text": "off a lot of this if you can the other" | |
| }, | |
| { | |
| "start": 5618.0, | |
| "text": "thing that sentence piece does is that" | |
| }, | |
| { | |
| "start": 5619.52, | |
| "text": "it has this concept of sentences so" | |
| }, | |
| { | |
| "start": 5623.04, | |
| "text": "sentence piece it's back it's kind of" | |
| }, | |
| { | |
| "start": 5625.48, | |
| "text": "like was developed I think early in the" | |
| }, | |
| { | |
| "start": 5626.84, | |
| "text": "days where there was um an idea that" | |
| }, | |
| { | |
| "start": 5630.159, | |
| "text": "they you're training a tokenizer on a" | |
| }, | |
| { | |
| "start": 5631.96, | |
| "text": "bunch of independent sentences so it has" | |
| }, | |
| { | |
| "start": 5634.199, | |
| "text": "a lot of like how many sentences you're" | |
| }, | |
| { | |
| "start": 5636.36, | |
| "text": "going to train on what is the maximum" | |
| }, | |
| { | |
| "start": 5638.0, | |
| "text": "sentence length" | |
| }, | |
| { | |
| "start": 5640.679, | |
| "text": "um shuffling sentences and so for it" | |
| }, | |
| { | |
| "start": 5643.719, | |
| "text": "sentences are kind of like the" | |
| }, | |
| { | |
| "start": 5644.8, | |
| "text": "individual training examples but again" | |
| }, | |
| { | |
| "start": 5646.88, | |
| "text": "in the context of llms I find that this" | |
| }, | |
| { | |
| "start": 5648.719, | |
| "text": "is like a very spurious and weird" | |
| }, | |
| { | |
| "start": 5650.44, | |
| "text": "distinction like sentences are just like" | |
| }, | |
| { | |
| "start": 5653.92, | |
| "text": "don't touch the raw data sentences" | |
| }, | |
| { | |
| "start": 5655.6, | |
| "text": "happen to exist but in raw data sets" | |
| }, | |
| { | |
| "start": 5658.679, | |
| "text": "there are a lot of like inet like what" | |
| }, | |
| { | |
| "start": 5660.6, | |
| "text": "exactly is a sentence what isn't a" | |
| }, | |
| { | |
| "start": 5662.44, | |
| "text": "sentence um and so I think like it's" | |
| }, | |
| { | |
| "start": 5665.0, | |
| "text": "really hard to Define what an actual" | |
| }, | |
| { | |
| "start": 5666.48, | |
| "text": "sentence is if you really like dig into" | |
| }, | |
| { | |
| "start": 5668.639, | |
| "text": "it and there could be different concepts" | |
| }, | |
| { | |
| "start": 5670.92, | |
| "text": "of it in different languages or" | |
| }, | |
| { | |
| "start": 5672.119, | |
| "text": "something like that so why even" | |
| }, | |
| { | |
| "start": 5673.719, | |
| "text": "introduce the concept it it doesn't" | |
| }, | |
| { | |
| "start": 5675.56, | |
| "text": "honestly make sense to me I would just" | |
| }, | |
| { | |
| "start": 5676.92, | |
| "text": "prefer to treat a file as a giant uh" | |
| }, | |
| { | |
| "start": 5679.199, | |
| "text": "stream of" | |
| }, | |
| { | |
| "start": 5680.36, | |
| "text": "bytes it has a lot of treatment around" | |
| }, | |
| { | |
| "start": 5682.8, | |
| "text": "rare word characters and when I say word" | |
| }, | |
| { | |
| "start": 5685.119, | |
| "text": "I mean code points we're going to come" | |
| }, | |
| { | |
| "start": 5686.48, | |
| "text": "back to this in a second and it has a" | |
| }, | |
| { | |
| "start": 5688.679, | |
| "text": "lot of other rules for um basically" | |
| }, | |
| { | |
| "start": 5691.679, | |
| "text": "splitting digits splitting white space" | |
| }, | |
| { | |
| "start": 5694.48, | |
| "text": "and numbers and how you deal with that" | |
| }, | |
| { | |
| "start": 5696.56, | |
| "text": "so these are some kind of like merge" | |
| }, | |
| { | |
| "start": 5698.199, | |
| "text": "rules so I think this is a little bit" | |
| }, | |
| { | |
| "start": 5700.08, | |
| "text": "equivalent to tick token using the" | |
| }, | |
| { | |
| "start": 5702.92, | |
| "text": "regular expression to split up" | |
| }, | |
| { | |
| "start": 5704.52, | |
| "text": "categories there's like kind of" | |
| }, | |
| { | |
| "start": 5707.04, | |
| "text": "equivalence of it if you squint T it in" | |
| }, | |
| { | |
| "start": 5709.239, | |
| "text": "sentence piece where you can also for" | |
| }, | |
| { | |
| "start": 5710.639, | |
| "text": "example split up split up the digits uh" | |
| }, | |
| { | |
| "start": 5714.199, | |
| "text": "and uh so" | |
| }, | |
| { | |
| "start": 5715.84, | |
| "text": "on there's a few more things here that" | |
| }, | |
| { | |
| "start": 5718.199, | |
| "text": "I'll come back to in a bit and then" | |
| }, | |
| { | |
| "start": 5719.36, | |
| "text": "there are some special tokens that you" | |
| }, | |
| { | |
| "start": 5720.48, | |
| "text": "can indicate and it hardcodes the UN" | |
| }, | |
| { | |
| "start": 5723.36, | |
| "text": "token the beginning of sentence end of" | |
| }, | |
| { | |
| "start": 5725.56, | |
| "text": "sentence and a pad token um and the UN" | |
| }, | |
| { | |
| "start": 5729.32, | |
| "text": "token must exist for my understanding" | |
| }, | |
| { | |
| "start": 5732.52, | |
| "text": "and then some some things so we can" | |
| }, | |
| { | |
| "start": 5734.719, | |
| "text": "train and when when I press train it's" | |
| }, | |
| { | |
| "start": 5737.28, | |
| "text": "going to create this file talk 400." | |
| }, | |
| { | |
| "start": 5740.119, | |
| "text": "model and talk 400. wab I can then load" | |
| }, | |
| { | |
| "start": 5743.159, | |
| "text": "the model file and I can inspect the" | |
| }, | |
| { | |
| "start": 5745.56, | |
| "text": "vocabulary off it and so we trained" | |
| }, | |
| { | |
| "start": 5748.56, | |
| "text": "vocab size 400 on this text here and" | |
| }, | |
| { | |
| "start": 5753.32, | |
| "text": "these are the individual pieces the" | |
| }, | |
| { | |
| "start": 5755.0, | |
| "text": "individual tokens that sentence piece" | |
| }, | |
| { | |
| "start": 5756.88, | |
| "text": "will create so in the beginning we see" | |
| }, | |
| { | |
| "start": 5758.8, | |
| "text": "that we have the an token uh with the ID" | |
| }, | |
| { | |
| "start": 5762.08, | |
| "text": "zero then we have the beginning of" | |
| }, | |
| { | |
| "start": 5764.04, | |
| "text": "sequence end of sequence one and two and" | |
| }, | |
| { | |
| "start": 5767.8, | |
| "text": "then we said that the pad ID is negative" | |
| }, | |
| { | |
| "start": 5769.32, | |
| "text": "1 so we chose not to use it so there's" | |
| }, | |
| { | |
| "start": 5772.08, | |
| "text": "no pad ID" | |
| }, | |
| { | |
| "start": 5773.48, | |
| "text": "here then these are individual bite" | |
| }, | |
| { | |
| "start": 5776.84, | |
| "text": "tokens so here we saw that bite fallback" | |
| }, | |
| { | |
| "start": 5780.159, | |
| "text": "in llama was turned on so it's true so" | |
| }, | |
| { | |
| "start": 5783.56, | |
| "text": "what follows are going to be the 256" | |
| }, | |
| { | |
| "start": 5786.159, | |
| "text": "bite" | |
| }, | |
| { | |
| "start": 5787.199, | |
| "text": "tokens and these are their" | |
| }, | |
| { | |
| "start": 5791.719, | |
| "text": "IDs and then at the bottom after the" | |
| }, | |
| { | |
| "start": 5795.04, | |
| "text": "bite tokens come the" | |
| }, | |
| { | |
| "start": 5797.679, | |
| "text": "merges and these are the parent nodes in" | |
| }, | |
| { | |
| "start": 5800.56, | |
| "text": "the merges so we're not seeing the" | |
| }, | |
| { | |
| "start": 5802.199, | |
| "text": "children we're just seeing the parents" | |
| }, | |
| { | |
| "start": 5803.719, | |
| "text": "and their" | |
| }, | |
| { | |
| "start": 5804.6, | |
| "text": "ID and then after the" | |
| }, | |
| { | |
| "start": 5807.04, | |
| "text": "merges comes eventually the individual" | |
| }, | |
| { | |
| "start": 5810.719, | |
| "text": "tokens and their IDs and so these are" | |
| }, | |
| { | |
| "start": 5813.56, | |
| "text": "the individual tokens so these are the" | |
| }, | |
| { | |
| "start": 5815.32, | |
| "text": "individual code Point tokens if you will" | |
| }, | |
| { | |
| "start": 5818.239, | |
| "text": "and they come at the end so that is the" | |
| }, | |
| { | |
| "start": 5820.28, | |
| "text": "ordering with which sentence piece sort" | |
| }, | |
| { | |
| "start": 5821.76, | |
| "text": "of like represents its vocabularies it" | |
| }, | |
| { | |
| "start": 5823.92, | |
| "text": "starts with special tokens then the bike" | |
| }, | |
| { | |
| "start": 5826.119, | |
| "text": "tokens then the merge tokens and then" | |
| }, | |
| { | |
| "start": 5828.159, | |
| "text": "the individual codo tokens and all these" | |
| }, | |
| { | |
| "start": 5831.639, | |
| "text": "raw codepoint to tokens are the ones" | |
| }, | |
| { | |
| "start": 5834.04, | |
| "text": "that it encountered in the training" | |
| }, | |
| { | |
| "start": 5836.119, | |
| "text": "set so those individual code points are" | |
| }, | |
| { | |
| "start": 5839.8, | |
| "text": "all the the entire set of code points" | |
| }, | |
| { | |
| "start": 5842.159, | |
| "text": "that occurred" | |
| }, | |
| { | |
| "start": 5844.4, | |
| "text": "here so those all get put in there and" | |
| }, | |
| { | |
| "start": 5847.48, | |
| "text": "then those that are extremely rare as" | |
| }, | |
| { | |
| "start": 5849.28, | |
| "text": "determined by character coverage so if a" | |
| }, | |
| { | |
| "start": 5851.119, | |
| "text": "code Point occurred only a single time" | |
| }, | |
| { | |
| "start": 5852.52, | |
| "text": "out of like a million um sentences or" | |
| }, | |
| { | |
| "start": 5855.159, | |
| "text": "something like that then it would be" | |
| }, | |
| { | |
| "start": 5857.08, | |
| "text": "ignored and it would not be added to our" | |
| }, | |
| { | |
| "start": 5860.199, | |
| "text": "uh" | |
| }, | |
| { | |
| "start": 5861.04, | |
| "text": "vocabulary once we have a vocabulary we" | |
| }, | |
| { | |
| "start": 5863.36, | |
| "text": "can encode into IDs and we can um sort" | |
| }, | |
| { | |
| "start": 5866.48, | |
| "text": "of get a" | |
| }, | |
| { | |
| "start": 5867.4, | |
| "text": "list and then here I am also decoding" | |
| }, | |
| { | |
| "start": 5870.679, | |
| "text": "the indiv idual tokens back into little" | |
| }, | |
| { | |
| "start": 5874.32, | |
| "text": "pieces as they call it so let's take a" | |
| }, | |
| { | |
| "start": 5876.96, | |
| "text": "look at what happened here hello space" | |
| }, | |
| { | |
| "start": 5881.08, | |
| "text": "on so these are the token IDs we got" | |
| }, | |
| { | |
| "start": 5884.679, | |
| "text": "back and when we look here uh a few" | |
| }, | |
| { | |
| "start": 5887.48, | |
| "text": "things sort of uh jump to mind number" | |
| }, | |
| { | |
| "start": 5891.52, | |
| "text": "one take a look at these characters the" | |
| }, | |
| { | |
| "start": 5894.159, | |
| "text": "Korean characters of course were not" | |
| }, | |
| { | |
| "start": 5895.52, | |
| "text": "part of the training set so sentence" | |
| }, | |
| { | |
| "start": 5898.0, | |
| "text": "piece is encountering code points that" | |
| }, | |
| { | |
| "start": 5899.599, | |
| "text": "it has not seen during training time and" | |
| }, | |
| { | |
| "start": 5902.199, | |
| "text": "those code points do not have a token" | |
| }, | |
| { | |
| "start": 5904.56, | |
| "text": "associated with them so suddenly these" | |
| }, | |
| { | |
| "start": 5906.4, | |
| "text": "are un tokens unknown tokens but because" | |
| }, | |
| { | |
| "start": 5910.56, | |
| "text": "bite fall back as true instead sentence" | |
| }, | |
| { | |
| "start": 5913.84, | |
| "text": "piece falls back to bytes and so it" | |
| }, | |
| { | |
| "start": 5916.44, | |
| "text": "takes this it encodes it with utf8 and" | |
| }, | |
| { | |
| "start": 5919.84, | |
| "text": "then it uses these tokens to represent" | |
| }, | |
| { | |
| "start": 5923.28, | |
| "text": "uh those bytes and that's what we are" | |
| }, | |
| { | |
| "start": 5925.8, | |
| "text": "getting sort of here this is the utf8 uh" | |
| }, | |
| { | |
| "start": 5929.719, | |
| "text": "encoding and in this shifted by three uh" | |
| }, | |
| { | |
| "start": 5932.88, | |
| "text": "because of these um special tokens here" | |
| }, | |
| { | |
| "start": 5936.239, | |
| "text": "that have IDs earlier on so that's what" | |
| }, | |
| { | |
| "start": 5938.84, | |
| "text": "happened here now one more thing that um" | |
| }, | |
| { | |
| "start": 5942.92, | |
| "text": "well first before I go on with respect" | |
| }, | |
| { | |
| "start": 5945.52, | |
| "text": "to the bitef back let me remove bite" | |
| }, | |
| { | |
| "start": 5948.239, | |
| "text": "foldback if this is false what's going" | |
| }, | |
| { | |
| "start": 5950.84, | |
| "text": "to happen let's" | |
| }, | |
| { | |
| "start": 5952.52, | |
| "text": "retrain so the first thing that happened" | |
| }, | |
| { | |
| "start": 5954.44, | |
| "text": "is all the bite tokens disappeared right" | |
| }, | |
| { | |
| "start": 5957.28, | |
| "text": "and now we just have the merges and we" | |
| }, | |
| { | |
| "start": 5959.0, | |
| "text": "have a lot more merges now because we" | |
| }, | |
| { | |
| "start": 5960.48, | |
| "text": "have a lot more space because we're not" | |
| }, | |
| { | |
| "start": 5961.8, | |
| "text": "taking up space in the wab size uh with" | |
| }, | |
| { | |
| "start": 5965.04, | |
| "text": "all the" | |
| }, | |
| { | |
| "start": 5965.96, | |
| "text": "bytes and now if we encode" | |
| }, | |
| { | |
| "start": 5969.08, | |
| "text": "this we get a zero so this entire string" | |
| }, | |
| { | |
| "start": 5973.239, | |
| "text": "here suddenly there's no bitef back so" | |
| }, | |
| { | |
| "start": 5975.119, | |
| "text": "this is unknown and unknown is an and so" | |
| }, | |
| { | |
| "start": 5979.4, | |
| "text": "this is zero because the an token is" | |
| }, | |
| { | |
| "start": 5982.04, | |
| "text": "token zero and you have to keep in mind" | |
| }, | |
| { | |
| "start": 5984.92, | |
| "text": "that this would feed into your uh" | |
| }, | |
| { | |
| "start": 5986.88, | |
| "text": "language model so what is a language" | |
| }, | |
| { | |
| "start": 5988.4, | |
| "text": "model supposed to do when all kinds of" | |
| }, | |
| { | |
| "start": 5989.92, | |
| "text": "different things that are unrecognized" | |
| }, | |
| { | |
| "start": 5992.159, | |
| "text": "because they're rare just end up mapping" | |
| }, | |
| { | |
| "start": 5994.0, | |
| "text": "into Unk it's not exactly the property" | |
| }, | |
| { | |
| "start": 5996.119, | |
| "text": "that you want so that's why I think" | |
| }, | |
| { | |
| "start": 5997.76, | |
| "text": "llama correctly uh used by fallback true" | |
| }, | |
| { | |
| "start": 6002.04, | |
| "text": "uh because we definitely want to feed" | |
| }, | |
| { | |
| "start": 6003.719, | |
| "text": "these um unknown or rare code points" | |
| }, | |
| { | |
| "start": 6006.04, | |
| "text": "into the model and some uh some manner" | |
| }, | |
| { | |
| "start": 6008.56, | |
| "text": "the next thing I want to show you is the" | |
| }, | |
| { | |
| "start": 6010.679, | |
| "text": "following notice here when we are" | |
| }, | |
| { | |
| "start": 6012.48, | |
| "text": "decoding all the individual tokens you" | |
| }, | |
| { | |
| "start": 6014.719, | |
| "text": "see how spaces uh space here ends up" | |
| }, | |
| { | |
| "start": 6018.04, | |
| "text": "being this um bold underline I'm not" | |
| }, | |
| { | |
| "start": 6021.239, | |
| "text": "100% sure by the way why sentence piece" | |
| }, | |
| { | |
| "start": 6023.08, | |
| "text": "switches whites space into these bold" | |
| }, | |
| { | |
| "start": 6025.36, | |
| "text": "underscore characters maybe it's for" | |
| }, | |
| { | |
| "start": 6027.639, | |
| "text": "visualization I'm not 100% sure why that" | |
| }, | |
| { | |
| "start": 6029.88, | |
| "text": "happens uh but notice this why do we" | |
| }, | |
| { | |
| "start": 6032.44, | |
| "text": "have an extra space in the front of" | |
| }, | |
| { | |
| "start": 6037.44, | |
| "text": "hello um what where is this coming from" | |
| }, | |
| { | |
| "start": 6040.48, | |
| "text": "well it's coming from this option" | |
| }, | |
| { | |
| "start": 6043.159, | |
| "text": "here" | |
| }, | |
| { | |
| "start": 6045.04, | |
| "text": "um add dummy prefix is true and when you" | |
| }, | |
| { | |
| "start": 6048.36, | |
| "text": "go to the" | |
| }, | |
| { | |
| "start": 6049.56, | |
| "text": "documentation add D whites space at the" | |
| }, | |
| { | |
| "start": 6051.88, | |
| "text": "beginning of text in order to treat" | |
| }, | |
| { | |
| "start": 6053.36, | |
| "text": "World in world and hello world in the" | |
| }, | |
| { | |
| "start": 6055.92, | |
| "text": "exact same way so what this is trying to" | |
| }, | |
| { | |
| "start": 6057.96, | |
| "text": "do is the" | |
| }, | |
| { | |
| "start": 6059.239, | |
| "text": "following if we go back to our tick" | |
| }, | |
| { | |
| "start": 6062.04, | |
| "text": "tokenizer world as uh token by itself" | |
| }, | |
| { | |
| "start": 6066.32, | |
| "text": "has a different ID than space world so" | |
| }, | |
| { | |
| "start": 6070.239, | |
| "text": "we have this is 1917 but this is 14 Etc" | |
| }, | |
| { | |
| "start": 6074.599, | |
| "text": "so these are two different tokens for" | |
| }, | |
| { | |
| "start": 6076.0, | |
| "text": "the language model and the language" | |
| }, | |
| { | |
| "start": 6077.4, | |
| "text": "model has to learn from data that they" | |
| }, | |
| { | |
| "start": 6078.88, | |
| "text": "are actually kind of like a very similar" | |
| }, | |
| { | |
| "start": 6080.32, | |
| "text": "concept so to the language model in the" | |
| }, | |
| { | |
| "start": 6083.0, | |
| "text": "Tik token World um basically words in" | |
| }, | |
| { | |
| "start": 6086.0, | |
| "text": "the beginning of sentences and words in" | |
| }, | |
| { | |
| "start": 6087.639, | |
| "text": "the middle of sentences actually look" | |
| }, | |
| { | |
| "start": 6089.04, | |
| "text": "completely different um and it has to" | |
| }, | |
| { | |
| "start": 6092.04, | |
| "text": "learned that they are roughly the same" | |
| }, | |
| { | |
| "start": 6094.44, | |
| "text": "so this add dami prefix is trying to" | |
| }, | |
| { | |
| "start": 6096.92, | |
| "text": "fight that a little bit and the way that" | |
| }, | |
| { | |
| "start": 6098.96, | |
| "text": "works is that it basically" | |
| }, | |
| { | |
| "start": 6101.719, | |
| "text": "uh adds a dummy prefix so for as a as a" | |
| }, | |
| { | |
| "start": 6106.76, | |
| "text": "part of pre-processing it will take the" | |
| }, | |
| { | |
| "start": 6109.08, | |
| "text": "string and it will add a space it will" | |
| }, | |
| { | |
| "start": 6111.32, | |
| "text": "do this and that's done in an effort to" | |
| }, | |
| { | |
| "start": 6114.92, | |
| "text": "make this world and that world the same" | |
| }, | |
| { | |
| "start": 6117.52, | |
| "text": "they will both be space world so that's" | |
| }, | |
| { | |
| "start": 6120.28, | |
| "text": "one other kind of pre-processing option" | |
| }, | |
| { | |
| "start": 6122.159, | |
| "text": "that is turned on and llama 2 also uh" | |
| }, | |
| { | |
| "start": 6125.28, | |
| "text": "uses this option and that's I think" | |
| }, | |
| { | |
| "start": 6127.4, | |
| "text": "everything that I want to say for my" | |
| }, | |
| { | |
| "start": 6128.639, | |
| "text": "preview of sentence piece and how it is" | |
| }, | |
| { | |
| "start": 6130.44, | |
| "text": "different um maybe here what I've done" | |
| }, | |
| { | |
| "start": 6133.119, | |
| "text": "is I just uh put in the Raw protocol" | |
| }, | |
| { | |
| "start": 6136.719, | |
| "text": "buffer representation basically of the" | |
| }, | |
| { | |
| "start": 6139.84, | |
| "text": "tokenizer the too trained so feel free" | |
| }, | |
| { | |
| "start": 6142.88, | |
| "text": "to sort of Step through this and if you" | |
| }, | |
| { | |
| "start": 6144.76, | |
| "text": "would like uh your tokenization to look" | |
| }, | |
| { | |
| "start": 6147.0, | |
| "text": "identical to that of the meta uh llama 2" | |
| }, | |
| { | |
| "start": 6150.32, | |
| "text": "then you would be copy pasting these" | |
| }, | |
| { | |
| "start": 6151.679, | |
| "text": "settings as I tried to do up above and" | |
| }, | |
| { | |
| "start": 6154.76, | |
| "text": "uh yeah that's I think that's it for" | |
| }, | |
| { | |
| "start": 6156.96, | |
| "text": "this section I think my summary for" | |
| }, | |
| { | |
| "start": 6158.88, | |
| "text": "sentence piece from all of this is" | |
| }, | |
| { | |
| "start": 6160.8, | |
| "text": "number one I think that there's a lot of" | |
| }, | |
| { | |
| "start": 6162.44, | |
| "text": "historical baggage in sentence piece a" | |
| }, | |
| { | |
| "start": 6164.28, | |
| "text": "lot of Concepts that I think are" | |
| }, | |
| { | |
| "start": 6165.679, | |
| "text": "slightly confusing and I think" | |
| }, | |
| { | |
| "start": 6167.239, | |
| "text": "potentially um contain foot guns like" | |
| }, | |
| { | |
| "start": 6169.4, | |
| "text": "this concept of a sentence and it's" | |
| }, | |
| { | |
| "start": 6170.8, | |
| "text": "maximum length and stuff like that um" | |
| }, | |
| { | |
| "start": 6173.719, | |
| "text": "otherwise it is fairly commonly used in" | |
| }, | |
| { | |
| "start": 6175.88, | |
| "text": "the industry um because it is efficient" | |
| }, | |
| { | |
| "start": 6178.88, | |
| "text": "and can do both training and inference" | |
| }, | |
| { | |
| "start": 6181.0, | |
| "text": "uh it has a few quirks like for example" | |
| }, | |
| { | |
| "start": 6182.76, | |
| "text": "un token must exist and the way the bite" | |
| }, | |
| { | |
| "start": 6185.08, | |
| "text": "fallbacks are done and so on I don't" | |
| }, | |
| { | |
| "start": 6186.56, | |
| "text": "find particularly elegant and" | |
| }, | |
| { | |
| "start": 6188.36, | |
| "text": "unfortunately I have to say it's not" | |
| }, | |
| { | |
| "start": 6189.56, | |
| "text": "very well documented so it took me a lot" | |
| }, | |
| { | |
| "start": 6191.44, | |
| "text": "of time working with this myself um and" | |
| }, | |
| { | |
| "start": 6194.76, | |
| "text": "just visualizing things and trying to" | |
| }, | |
| { | |
| "start": 6196.159, | |
| "text": "really understand what is happening here" | |
| }, | |
| { | |
| "start": 6197.8, | |
| "text": "because uh the documentation" | |
| }, | |
| { | |
| "start": 6199.28, | |
| "text": "unfortunately is in my opion not not" | |
| }, | |
| { | |
| "start": 6201.44, | |
| "text": "super amazing but it is a very nice repo" | |
| }, | |
| { | |
| "start": 6204.679, | |
| "text": "that is available to you if you'd like" | |
| }, | |
| { | |
| "start": 6206.159, | |
| "text": "to train your own tokenizer right now" | |
| }, | |
| { | |
| "start": 6208.199, | |
| "text": "okay let me now switch gears again as" | |
| }, | |
| { | |
| "start": 6209.639, | |
| "text": "we're starting to slowly wrap up here I" | |
| }, | |
| { | |
| "start": 6211.719, | |
| "text": "want to revisit this issue in a bit more" | |
| }, | |
| { | |
| "start": 6213.36, | |
| "text": "detail of how we should set the vocap" | |
| }, | |
| { | |
| "start": 6215.32, | |
| "text": "size and what are some of the" | |
| }, | |
| { | |
| "start": 6216.199, | |
| "text": "considerations around it so for this I'd" | |
| }, | |
| { | |
| "start": 6219.639, | |
| "text": "like to go back to the model" | |
| }, | |
| { | |
| "start": 6220.84, | |
| "text": "architecture that we developed in the" | |
| }, | |
| { | |
| "start": 6222.159, | |
| "text": "last video when we built the GPT from" | |
| }, | |
| { | |
| "start": 6224.679, | |
| "text": "scratch so this here was uh the file" | |
| }, | |
| { | |
| "start": 6227.4, | |
| "text": "that we built in the previous video and" | |
| }, | |
| { | |
| "start": 6229.08, | |
| "text": "we defined the Transformer model and and" | |
| }, | |
| { | |
| "start": 6231.32, | |
| "text": "let's specifically look at Bap size and" | |
| }, | |
| { | |
| "start": 6232.88, | |
| "text": "where it appears in this file so here we" | |
| }, | |
| { | |
| "start": 6235.199, | |
| "text": "Define the voap size uh at this time it" | |
| }, | |
| { | |
| "start": 6238.159, | |
| "text": "was 65 or something like that extremely" | |
| }, | |
| { | |
| "start": 6239.96, | |
| "text": "small number so this will grow much" | |
| }, | |
| { | |
| "start": 6242.08, | |
| "text": "larger you'll see that Bap size doesn't" | |
| }, | |
| { | |
| "start": 6244.28, | |
| "text": "come up too much in most of these layers" | |
| }, | |
| { | |
| "start": 6246.159, | |
| "text": "the only place that it comes up to is in" | |
| }, | |
| { | |
| "start": 6248.52, | |
| "text": "exactly these two places here so when we" | |
| }, | |
| { | |
| "start": 6251.48, | |
| "text": "Define the language model there's the" | |
| }, | |
| { | |
| "start": 6253.56, | |
| "text": "token embedding table which is this" | |
| }, | |
| { | |
| "start": 6255.8, | |
| "text": "two-dimensional array where the vocap" | |
| }, | |
| { | |
| "start": 6258.08, | |
| "text": "size is basically the number of rows and" | |
| }, | |
| { | |
| "start": 6261.199, | |
| "text": "uh each vocabulary element each token" | |
| }, | |
| { | |
| "start": 6263.92, | |
| "text": "has a vector that we're going to train" | |
| }, | |
| { | |
| "start": 6265.92, | |
| "text": "using back propagation that Vector is of" | |
| }, | |
| { | |
| "start": 6267.96, | |
| "text": "size and embed which is number of" | |
| }, | |
| { | |
| "start": 6269.44, | |
| "text": "channels in the Transformer and" | |
| }, | |
| { | |
| "start": 6271.599, | |
| "text": "basically as voap size increases this" | |
| }, | |
| { | |
| "start": 6273.679, | |
| "text": "embedding table as I mentioned earlier" | |
| }, | |
| { | |
| "start": 6275.679, | |
| "text": "is going to also grow we're going to be" | |
| }, | |
| { | |
| "start": 6277.0, | |
| "text": "adding rows in addition to that at the" | |
| }, | |
| { | |
| "start": 6279.719, | |
| "text": "end of the Transformer there's this LM" | |
| }, | |
| { | |
| "start": 6281.88, | |
| "text": "head layer which is a linear layer and" | |
| }, | |
| { | |
| "start": 6284.239, | |
| "text": "you'll notice that that layer is used at" | |
| }, | |
| { | |
| "start": 6286.28, | |
| "text": "the very end to produce the logits uh" | |
| }, | |
| { | |
| "start": 6288.639, | |
| "text": "which become the probabilities for the" | |
| }, | |
| { | |
| "start": 6289.96, | |
| "text": "next token in sequence and so" | |
| }, | |
| { | |
| "start": 6291.76, | |
| "text": "intuitively we're trying to produce a" | |
| }, | |
| { | |
| "start": 6293.92, | |
| "text": "probability for every single token that" | |
| }, | |
| { | |
| "start": 6296.239, | |
| "text": "might come next at every point in time" | |
| }, | |
| { | |
| "start": 6298.84, | |
| "text": "of that Transformer and if we have more" | |
| }, | |
| { | |
| "start": 6301.08, | |
| "text": "and more tokens we need to produce more" | |
| }, | |
| { | |
| "start": 6302.679, | |
| "text": "and more probabilities so every single" | |
| }, | |
| { | |
| "start": 6304.92, | |
| "text": "token is going to introduce an" | |
| }, | |
| { | |
| "start": 6306.199, | |
| "text": "additional dot product that we have to" | |
| }, | |
| { | |
| "start": 6308.159, | |
| "text": "do here in this linear layer for this" | |
| }, | |
| { | |
| "start": 6310.199, | |
| "text": "final layer in a" | |
| }, | |
| { | |
| "start": 6311.44, | |
| "text": "Transformer so why can't vocap size be" | |
| }, | |
| { | |
| "start": 6314.56, | |
| "text": "infinite why can't we grow to Infinity" | |
| }, | |
| { | |
| "start": 6316.52, | |
| "text": "well number one your token embedding" | |
| }, | |
| { | |
| "start": 6318.199, | |
| "text": "table is going to grow uh your linear" | |
| }, | |
| { | |
| "start": 6321.56, | |
| "text": "layer is going to grow so we're going to" | |
| }, | |
| { | |
| "start": 6323.599, | |
| "text": "be doing a lot more computation here" | |
| }, | |
| { | |
| "start": 6325.119, | |
| "text": "because this LM head layer will become" | |
| }, | |
| { | |
| "start": 6326.56, | |
| "text": "more computational expensive number two" | |
| }, | |
| { | |
| "start": 6329.119, | |
| "text": "because we have more parameters we could" | |
| }, | |
| { | |
| "start": 6330.84, | |
| "text": "be worried that we are going to be under" | |
| }, | |
| { | |
| "start": 6333.44, | |
| "text": "trining some of these" | |
| }, | |
| { | |
| "start": 6335.199, | |
| "text": "parameters so intuitively if you have a" | |
| }, | |
| { | |
| "start": 6337.4, | |
| "text": "very large vocabulary size say we have a" | |
| }, | |
| { | |
| "start": 6338.96, | |
| "text": "million uh tokens then every one of" | |
| }, | |
| { | |
| "start": 6341.32, | |
| "text": "these tokens is going to come up more" | |
| }, | |
| { | |
| "start": 6342.679, | |
| "text": "and more rarely in the training data" | |
| }, | |
| { | |
| "start": 6345.04, | |
| "text": "because there's a lot more other tokens" | |
| }, | |
| { | |
| "start": 6346.52, | |
| "text": "all over the place and so we're going to" | |
| }, | |
| { | |
| "start": 6348.56, | |
| "text": "be seeing fewer and fewer examples uh" | |
| }, | |
| { | |
| "start": 6351.0, | |
| "text": "for each individual token and you might" | |
| }, | |
| { | |
| "start": 6353.28, | |
| "text": "be worried that basically the vectors" | |
| }, | |
| { | |
| "start": 6355.0, | |
| "text": "associated with every token will be" | |
| }, | |
| { | |
| "start": 6356.28, | |
| "text": "undertrained as a result because they" | |
| }, | |
| { | |
| "start": 6358.28, | |
| "text": "just don't come up too often and they" | |
| }, | |
| { | |
| "start": 6359.92, | |
| "text": "don't participate in the forward" | |
| }, | |
| { | |
| "start": 6360.96, | |
| "text": "backward pass in addition to that as" | |
| }, | |
| { | |
| "start": 6363.199, | |
| "text": "your vocab size grows you're going to" | |
| }, | |
| { | |
| "start": 6364.88, | |
| "text": "start shrinking your sequences a lot" | |
| }, | |
| { | |
| "start": 6367.04, | |
| "text": "right and that's really nice because" | |
| }, | |
| { | |
| "start": 6369.32, | |
| "text": "that means that we're going to be" | |
| }, | |
| { | |
| "start": 6370.119, | |
| "text": "attending to more and more text so" | |
| }, | |
| { | |
| "start": 6372.0, | |
| "text": "that's nice but also you might be" | |
| }, | |
| { | |
| "start": 6373.599, | |
| "text": "worrying that two large of chunks are" | |
| }, | |
| { | |
| "start": 6375.92, | |
| "text": "being squished into single tokens and so" | |
| }, | |
| { | |
| "start": 6378.56, | |
| "text": "the model just doesn't have as much of" | |
| }, | |
| { | |
| "start": 6380.719, | |
| "text": "time to think per sort of um some number" | |
| }, | |
| { | |
| "start": 6385.08, | |
| "text": "of characters in the text or you can" | |
| }, | |
| { | |
| "start": 6386.679, | |
| "text": "think about it that way right so" | |
| }, | |
| { | |
| "start": 6388.08, | |
| "text": "basically we're squishing too much" | |
| }, | |
| { | |
| "start": 6389.48, | |
| "text": "information into a single token and then" | |
| }, | |
| { | |
| "start": 6391.639, | |
| "text": "the forward pass of the Transformer is" | |
| }, | |
| { | |
| "start": 6393.04, | |
| "text": "not enough to actually process that" | |
| }, | |
| { | |
| "start": 6394.4, | |
| "text": "information appropriately and so these" | |
| }, | |
| { | |
| "start": 6396.44, | |
| "text": "are some of the considerations you're" | |
| }, | |
| { | |
| "start": 6397.48, | |
| "text": "thinking about when you're designing the" | |
| }, | |
| { | |
| "start": 6398.639, | |
| "text": "vocab size as I mentioned this is mostly" | |
| }, | |
| { | |
| "start": 6400.639, | |
| "text": "an empirical hyperparameter and it seems" | |
| }, | |
| { | |
| "start": 6402.88, | |
| "text": "like in state-of-the-art architectures" | |
| }, | |
| { | |
| "start": 6404.239, | |
| "text": "today this is usually in the high 10,000" | |
| }, | |
| { | |
| "start": 6406.76, | |
| "text": "or somewhere around 100,000 today and" | |
| }, | |
| { | |
| "start": 6409.36, | |
| "text": "the next consideration I want to briefly" | |
| }, | |
| { | |
| "start": 6410.88, | |
| "text": "talk about is what if we want to take a" | |
| }, | |
| { | |
| "start": 6413.0, | |
| "text": "pre-trained model and we want to extend" | |
| }, | |
| { | |
| "start": 6415.199, | |
| "text": "the vocap size and this is done fairly" | |
| }, | |
| { | |
| "start": 6417.36, | |
| "text": "commonly actually so for example when" | |
| }, | |
| { | |
| "start": 6418.88, | |
| "text": "you're doing fine-tuning for cha GPT um" | |
| }, | |
| { | |
| "start": 6422.159, | |
| "text": "a lot more new special tokens get" | |
| }, | |
| { | |
| "start": 6423.76, | |
| "text": "introduced on top of the base model to" | |
| }, | |
| { | |
| "start": 6425.8, | |
| "text": "maintain the metadata and all the" | |
| }, | |
| { | |
| "start": 6428.04, | |
| "text": "structure of conversation objects" | |
| }, | |
| { | |
| "start": 6429.88, | |
| "text": "between a user and an assistant so that" | |
| }, | |
| { | |
| "start": 6431.92, | |
| "text": "takes a lot of special tokens you might" | |
| }, | |
| { | |
| "start": 6434.04, | |
| "text": "also try to throw in more special tokens" | |
| }, | |
| { | |
| "start": 6435.88, | |
| "text": "for example for using the browser or any" | |
| }, | |
| { | |
| "start": 6437.8, | |
| "text": "other tool and so it's very tempting to" | |
| }, | |
| { | |
| "start": 6440.639, | |
| "text": "add a lot of tokens for all kinds of" | |
| }, | |
| { | |
| "start": 6442.159, | |
| "text": "special functionality so if you want to" | |
| }, | |
| { | |
| "start": 6444.52, | |
| "text": "be adding a token that's totally" | |
| }, | |
| { | |
| "start": 6445.8, | |
| "text": "possible Right all we have to do is we" | |
| }, | |
| { | |
| "start": 6447.719, | |
| "text": "have to resize this embedding so we have" | |
| }, | |
| { | |
| "start": 6449.88, | |
| "text": "to add rows we would initialize these uh" | |
| }, | |
| { | |
| "start": 6452.48, | |
| "text": "parameters from scratch to be small" | |
| }, | |
| { | |
| "start": 6454.44, | |
| "text": "random numbers and then we have to" | |
| }, | |
| { | |
| "start": 6456.119, | |
| "text": "extend the weight inside this linear uh" | |
| }, | |
| { | |
| "start": 6459.28, | |
| "text": "so we have to start making dot products" | |
| }, | |
| { | |
| "start": 6461.44, | |
| "text": "um with the associated parameters as" | |
| }, | |
| { | |
| "start": 6463.199, | |
| "text": "well to basically calculate the" | |
| }, | |
| { | |
| "start": 6464.56, | |
| "text": "probabilities for these new tokens so" | |
| }, | |
| { | |
| "start": 6466.76, | |
| "text": "both of these are just a resizing" | |
| }, | |
| { | |
| "start": 6468.639, | |
| "text": "operation it's a very mild" | |
| }, | |
| { | |
| "start": 6470.84, | |
| "text": "model surgery and can be done fairly" | |
| }, | |
| { | |
| "start": 6472.599, | |
| "text": "easily and it's quite common that" | |
| }, | |
| { | |
| "start": 6474.04, | |
| "text": "basically you would freeze the base" | |
| }, | |
| { | |
| "start": 6475.36, | |
| "text": "model you introduce these new parameters" | |
| }, | |
| { | |
| "start": 6477.44, | |
| "text": "and then you only train these new" | |
| }, | |
| { | |
| "start": 6478.639, | |
| "text": "parameters to introduce new tokens into" | |
| }, | |
| { | |
| "start": 6480.56, | |
| "text": "the architecture um and so you can" | |
| }, | |
| { | |
| "start": 6483.119, | |
| "text": "freeze arbitrary parts of it or you can" | |
| }, | |
| { | |
| "start": 6484.96, | |
| "text": "train arbitrary parts of it and that's" | |
| }, | |
| { | |
| "start": 6486.4, | |
| "text": "totally up to you but basically minor" | |
| }, | |
| { | |
| "start": 6488.32, | |
| "text": "surgery required if you'd like to" | |
| }, | |
| { | |
| "start": 6490.119, | |
| "text": "introduce new tokens and finally I'd" | |
| }, | |
| { | |
| "start": 6491.88, | |
| "text": "like to mention that actually there's an" | |
| }, | |
| { | |
| "start": 6493.36, | |
| "text": "entire design space of applications in" | |
| }, | |
| { | |
| "start": 6495.92, | |
| "text": "terms of introducing new tokens into a" | |
| }, | |
| { | |
| "start": 6497.639, | |
| "text": "vocabulary that go Way Beyond just" | |
| }, | |
| { | |
| "start": 6499.36, | |
| "text": "adding special tokens and special new" | |
| }, | |
| { | |
| "start": 6501.199, | |
| "text": "functionality so just to give you a" | |
| }, | |
| { | |
| "start": 6503.0, | |
| "text": "sense of the design space but this could" | |
| }, | |
| { | |
| "start": 6504.36, | |
| "text": "be an entire video just by itself uh" | |
| }, | |
| { | |
| "start": 6506.599, | |
| "text": "this is a paper on learning to compress" | |
| }, | |
| { | |
| "start": 6508.639, | |
| "text": "prompts with what they called uh gist" | |
| }, | |
| { | |
| "start": 6511.04, | |
| "text": "tokens and the rough idea is suppose" | |
| }, | |
| { | |
| "start": 6513.4, | |
| "text": "that you're using language models in a" | |
| }, | |
| { | |
| "start": 6514.679, | |
| "text": "setting that requires very long prompts" | |
| }, | |
| { | |
| "start": 6517.159, | |
| "text": "while these long prompts just slow" | |
| }, | |
| { | |
| "start": 6518.8, | |
| "text": "everything down because you have to" | |
| }, | |
| { | |
| "start": 6519.84, | |
| "text": "encode them and then you have to use" | |
| }, | |
| { | |
| "start": 6521.4, | |
| "text": "them and then you're tending over them" | |
| }, | |
| { | |
| "start": 6523.119, | |
| "text": "and it's just um you know heavy to have" | |
| }, | |
| { | |
| "start": 6525.119, | |
| "text": "very large prompts so instead what they" | |
| }, | |
| { | |
| "start": 6527.639, | |
| "text": "do here in this paper is they introduce" | |
| }, | |
| { | |
| "start": 6530.679, | |
| "text": "new tokens and um imagine basically" | |
| }, | |
| { | |
| "start": 6534.56, | |
| "text": "having a few new tokens you put them in" | |
| }, | |
| { | |
| "start": 6536.4, | |
| "text": "a sequence and then you train the model" | |
| }, | |
| { | |
| "start": 6539.36, | |
| "text": "by distillation so you are keeping the" | |
| }, | |
| { | |
| "start": 6541.52, | |
| "text": "entire model Frozen and you're only" | |
| }, | |
| { | |
| "start": 6543.159, | |
| "text": "training the representations of the new" | |
| }, | |
| { | |
| "start": 6545.0, | |
| "text": "tokens their embeddings and you're" | |
| }, | |
| { | |
| "start": 6546.96, | |
| "text": "optimizing over the new tokens such that" | |
| }, | |
| { | |
| "start": 6549.44, | |
| "text": "the behavior of the language model is" | |
| }, | |
| { | |
| "start": 6551.92, | |
| "text": "identical uh to the model that has a" | |
| }, | |
| { | |
| "start": 6555.04, | |
| "text": "very long prompt that works for you and" | |
| }, | |
| { | |
| "start": 6557.679, | |
| "text": "so it's a compression technique of" | |
| }, | |
| { | |
| "start": 6559.0, | |
| "text": "compressing that very long prompt into" | |
| }, | |
| { | |
| "start": 6560.8, | |
| "text": "those few new gist tokens and so you can" | |
| }, | |
| { | |
| "start": 6563.8, | |
| "text": "train this and then at test time you can" | |
| }, | |
| { | |
| "start": 6565.04, | |
| "text": "discard your old prompt and just swap in" | |
| }, | |
| { | |
| "start": 6566.719, | |
| "text": "those tokens and they sort of like uh" | |
| }, | |
| { | |
| "start": 6568.639, | |
| "text": "stand in for that very long prompt and" | |
| }, | |
| { | |
| "start": 6571.119, | |
| "text": "have an almost identical performance and" | |
| }, | |
| { | |
| "start": 6573.679, | |
| "text": "so this is one um technique and a class" | |
| }, | |
| { | |
| "start": 6576.48, | |
| "text": "of parameter efficient fine-tuning" | |
| }, | |
| { | |
| "start": 6578.0, | |
| "text": "techniques where most of the model is" | |
| }, | |
| { | |
| "start": 6579.92, | |
| "text": "basically fixed and there's no training" | |
| }, | |
| { | |
| "start": 6581.88, | |
| "text": "of the model weights there's no training" | |
| }, | |
| { | |
| "start": 6583.599, | |
| "text": "of Laura or anything like that of new" | |
| }, | |
| { | |
| "start": 6585.44, | |
| "text": "parameters the the parameters that" | |
| }, | |
| { | |
| "start": 6587.239, | |
| "text": "you're training are now just the uh" | |
| }, | |
| { | |
| "start": 6589.119, | |
| "text": "token embeddings so that's just one" | |
| }, | |
| { | |
| "start": 6591.199, | |
| "text": "example but this could again be like an" | |
| }, | |
| { | |
| "start": 6592.88, | |
| "text": "entire video but just to give you a" | |
| }, | |
| { | |
| "start": 6594.52, | |
| "text": "sense that there's a whole design space" | |
| }, | |
| { | |
| "start": 6595.76, | |
| "text": "here that is potentially worth exploring" | |
| }, | |
| { | |
| "start": 6597.36, | |
| "text": "in the future the next thing I want to" | |
| }, | |
| { | |
| "start": 6599.199, | |
| "text": "briefly address is that I think recently" | |
| }, | |
| { | |
| "start": 6601.199, | |
| "text": "there's a lot of momentum in how you" | |
| }, | |
| { | |
| "start": 6603.08, | |
| "text": "actually could construct Transformers" | |
| }, | |
| { | |
| "start": 6605.08, | |
| "text": "that can simultaneously process not just" | |
| }, | |
| { | |
| "start": 6606.8, | |
| "text": "text as the input modality but a lot of" | |
| }, | |
| { | |
| "start": 6608.84, | |
| "text": "other modalities so be it images videos" | |
| }, | |
| { | |
| "start": 6611.52, | |
| "text": "audio Etc and how do you feed in all" | |
| }, | |
| { | |
| "start": 6614.28, | |
| "text": "these modalities and potentially predict" | |
| }, | |
| { | |
| "start": 6616.0, | |
| "text": "these modalities from a Transformer uh" | |
| }, | |
| { | |
| "start": 6618.84, | |
| "text": "do you have to change the architecture" | |
| }, | |
| { | |
| "start": 6619.84, | |
| "text": "in some fundamental way and I think what" | |
| }, | |
| { | |
| "start": 6621.599, | |
| "text": "a lot of people are starting to converge" | |
| }, | |
| { | |
| "start": 6623.119, | |
| "text": "towards is that you're not changing the" | |
| }, | |
| { | |
| "start": 6624.28, | |
| "text": "architecture you stick with the" | |
| }, | |
| { | |
| "start": 6625.44, | |
| "text": "Transformer you just kind of tokenize" | |
| }, | |
| { | |
| "start": 6627.56, | |
| "text": "your input domains and then call the day" | |
| }, | |
| { | |
| "start": 6629.96, | |
| "text": "and pretend it's just text tokens and" | |
| }, | |
| { | |
| "start": 6631.52, | |
| "text": "just do everything else identical in an" | |
| }, | |
| { | |
| "start": 6633.96, | |
| "text": "identical manner so here for example" | |
| }, | |
| { | |
| "start": 6636.08, | |
| "text": "there was a early paper that has nice" | |
| }, | |
| { | |
| "start": 6637.56, | |
| "text": "graphic for how you can take an image" | |
| }, | |
| { | |
| "start": 6639.599, | |
| "text": "and you can chunc at it into" | |
| }, | |
| { | |
| "start": 6642.159, | |
| "text": "integers um and these sometimes uh so" | |
| }, | |
| { | |
| "start": 6645.4, | |
| "text": "these will basically become the tokens" | |
| }, | |
| { | |
| "start": 6646.84, | |
| "text": "of images as an example and uh these" | |
| }, | |
| { | |
| "start": 6649.56, | |
| "text": "tokens can be uh hard tokens where you" | |
| }, | |
| { | |
| "start": 6652.199, | |
| "text": "force them to be integers they can also" | |
| }, | |
| { | |
| "start": 6653.92, | |
| "text": "be soft tokens where you uh sort of" | |
| }, | |
| { | |
| "start": 6657.0, | |
| "text": "don't require uh these to be discrete" | |
| }, | |
| { | |
| "start": 6660.239, | |
| "text": "but you do Force these representations" | |
| }, | |
| { | |
| "start": 6662.159, | |
| "text": "to go through bottlenecks like in Auto" | |
| }, | |
| { | |
| "start": 6664.76, | |
| "text": "encoders uh also in this paper that came" | |
| }, | |
| { | |
| "start": 6666.92, | |
| "text": "out from open a SORA which I think" | |
| }, | |
| { | |
| "start": 6668.88, | |
| "text": "really um uh blew the mind of many" | |
| }, | |
| { | |
| "start": 6671.84, | |
| "text": "people and inspired a lot of people in" | |
| }, | |
| { | |
| "start": 6673.52, | |
| "text": "terms of what's possible they have a" | |
| }, | |
| { | |
| "start": 6675.199, | |
| "text": "Graphic here and they talk briefly about" | |
| }, | |
| { | |
| "start": 6676.92, | |
| "text": "how llms have text tokens Sora has" | |
| }, | |
| { | |
| "start": 6680.159, | |
| "text": "visual patches so again they came up" | |
| }, | |
| { | |
| "start": 6682.52, | |
| "text": "with a way to chunc a videos into" | |
| }, | |
| { | |
| "start": 6684.92, | |
| "text": "basically tokens when they own" | |
| }, | |
| { | |
| "start": 6686.52, | |
| "text": "vocabularies and then you can either" | |
| }, | |
| { | |
| "start": 6688.52, | |
| "text": "process discrete tokens say with autog" | |
| }, | |
| { | |
| "start": 6690.04, | |
| "text": "regressive models or even soft tokens" | |
| }, | |
| { | |
| "start": 6692.079, | |
| "text": "with diffusion models and uh all of that" | |
| }, | |
| { | |
| "start": 6695.239, | |
| "text": "is sort of uh being actively worked on" | |
| }, | |
| { | |
| "start": 6698.239, | |
| "text": "designed on and is beyond the scope of" | |
| }, | |
| { | |
| "start": 6699.639, | |
| "text": "this video but just something I wanted" | |
| }, | |
| { | |
| "start": 6700.88, | |
| "text": "to mention briefly okay now that we have" | |
| }, | |
| { | |
| "start": 6702.96, | |
| "text": "come quite deep into the tokenization" | |
| }, | |
| { | |
| "start": 6705.119, | |
| "text": "algorithm and we understand a lot more" | |
| }, | |
| { | |
| "start": 6706.76, | |
| "text": "about how it works let's loop back" | |
| }, | |
| { | |
| "start": 6708.92, | |
| "text": "around to the beginning of this video" | |
| }, | |
| { | |
| "start": 6710.52, | |
| "text": "and go through some of these bullet" | |
| }, | |
| { | |
| "start": 6711.599, | |
| "text": "points and really see why they happen so" | |
| }, | |
| { | |
| "start": 6714.88, | |
| "text": "first of all why can't my llm spell" | |
| }, | |
| { | |
| "start": 6716.96, | |
| "text": "words very well or do other spell" | |
| }, | |
| { | |
| "start": 6718.76, | |
| "text": "related" | |
| }, | |
| { | |
| "start": 6720.56, | |
| "text": "tasks so fundamentally this is because" | |
| }, | |
| { | |
| "start": 6722.92, | |
| "text": "as we saw these characters are chunked" | |
| }, | |
| { | |
| "start": 6725.679, | |
| "text": "up into tokens and some of these tokens" | |
| }, | |
| { | |
| "start": 6727.96, | |
| "text": "are actually fairly long so as an" | |
| }, | |
| { | |
| "start": 6730.4, | |
| "text": "example I went to the gp4 vocabulary and" | |
| }, | |
| { | |
| "start": 6732.8, | |
| "text": "I looked at uh one of the longer tokens" | |
| }, | |
| { | |
| "start": 6735.28, | |
| "text": "so that default style turns out to be a" | |
| }, | |
| { | |
| "start": 6737.88, | |
| "text": "single individual token so that's a lot" | |
| }, | |
| { | |
| "start": 6739.719, | |
| "text": "of characters for a single token so my" | |
| }, | |
| { | |
| "start": 6742.159, | |
| "text": "suspicion is that there's just too much" | |
| }, | |
| { | |
| "start": 6743.76, | |
| "text": "crammed into this single token and my" | |
| }, | |
| { | |
| "start": 6746.079, | |
| "text": "suspicion was that the model should not" | |
| }, | |
| { | |
| "start": 6747.76, | |
| "text": "be very good at tasks related to" | |
| }, | |
| { | |
| "start": 6750.36, | |
| "text": "spelling of this uh single token so I" | |
| }, | |
| { | |
| "start": 6754.679, | |
| "text": "asked how many letters L are there in" | |
| }, | |
| { | |
| "start": 6757.0, | |
| "text": "the word default style and of course my" | |
| }, | |
| { | |
| "start": 6761.48, | |
| "text": "prompt is intentionally done that way" | |
| }, | |
| { | |
| "start": 6764.36, | |
| "text": "and you see how default style will be a" | |
| }, | |
| { | |
| "start": 6765.76, | |
| "text": "single token so this is what the model" | |
| }, | |
| { | |
| "start": 6767.36, | |
| "text": "sees so my suspicion is that it wouldn't" | |
| }, | |
| { | |
| "start": 6769.4, | |
| "text": "be very good at this and indeed it is" | |
| }, | |
| { | |
| "start": 6771.32, | |
| "text": "not it doesn't actually know how many" | |
| }, | |
| { | |
| "start": 6773.159, | |
| "text": "L's are in there it thinks there are" | |
| }, | |
| { | |
| "start": 6774.639, | |
| "text": "three and actually there are four if I'm" | |
| }, | |
| { | |
| "start": 6777.0, | |
| "text": "not getting this wrong myself so that" | |
| }, | |
| { | |
| "start": 6779.639, | |
| "text": "didn't go extremely well let's look look" | |
| }, | |
| { | |
| "start": 6782.32, | |
| "text": "at another kind of uh character level" | |
| }, | |
| { | |
| "start": 6784.599, | |
| "text": "task so for example here I asked uh gp4" | |
| }, | |
| { | |
| "start": 6788.4, | |
| "text": "to reverse the string default style and" | |
| }, | |
| { | |
| "start": 6791.159, | |
| "text": "they tried to use a code interpreter and" | |
| }, | |
| { | |
| "start": 6793.199, | |
| "text": "I stopped it and I said just do it just" | |
| }, | |
| { | |
| "start": 6795.44, | |
| "text": "try it and uh it gave me jumble so it" | |
| }, | |
| { | |
| "start": 6799.56, | |
| "text": "doesn't actually really know how to" | |
| }, | |
| { | |
| "start": 6801.44, | |
| "text": "reverse this string going from right to" | |
| }, | |
| { | |
| "start": 6803.76, | |
| "text": "left uh so it gave a wrong result so" | |
| }, | |
| { | |
| "start": 6806.76, | |
| "text": "again like working with this working" | |
| }, | |
| { | |
| "start": 6808.32, | |
| "text": "hypothesis that maybe this is due to the" | |
| }, | |
| { | |
| "start": 6810.0, | |
| "text": "tokenization I tried a different" | |
| }, | |
| { | |
| "start": 6811.84, | |
| "text": "approach I said okay let's reverse the" | |
| }, | |
| { | |
| "start": 6814.119, | |
| "text": "exact same string but take the following" | |
| }, | |
| { | |
| "start": 6816.44, | |
| "text": "approach step one just print out every" | |
| }, | |
| { | |
| "start": 6818.679, | |
| "text": "single character separated by spaces and" | |
| }, | |
| { | |
| "start": 6820.719, | |
| "text": "then as a step two reverse that list and" | |
| }, | |
| { | |
| "start": 6823.28, | |
| "text": "it again Tred to use a tool but when I" | |
| }, | |
| { | |
| "start": 6824.8, | |
| "text": "stopped it it uh first uh produced all" | |
| }, | |
| { | |
| "start": 6827.76, | |
| "text": "the characters and that was actually" | |
| }, | |
| { | |
| "start": 6828.92, | |
| "text": "correct and then It reversed them and" | |
| }, | |
| { | |
| "start": 6830.92, | |
| "text": "that was correct once it had this so" | |
| }, | |
| { | |
| "start": 6833.04, | |
| "text": "somehow it can't reverse it directly but" | |
| }, | |
| { | |
| "start": 6834.88, | |
| "text": "when you go just first uh you know" | |
| }, | |
| { | |
| "start": 6837.4, | |
| "text": "listing it out in order it can do that" | |
| }, | |
| { | |
| "start": 6839.28, | |
| "text": "somehow and then it can once it's uh" | |
| }, | |
| { | |
| "start": 6841.88, | |
| "text": "broken up this way this becomes all" | |
| }, | |
| { | |
| "start": 6843.88, | |
| "text": "these individual characters and so now" | |
| }, | |
| { | |
| "start": 6846.04, | |
| "text": "this is much easier for it to see these" | |
| }, | |
| { | |
| "start": 6847.88, | |
| "text": "individual tokens and reverse them and" | |
| }, | |
| { | |
| "start": 6850.079, | |
| "text": "print them out so that is kind of" | |
| }, | |
| { | |
| "start": 6853.52, | |
| "text": "interesting so let's continue now why" | |
| }, | |
| { | |
| "start": 6856.84, | |
| "text": "are llms worse at uh non-english langu" | |
| }, | |
| { | |
| "start": 6860.4, | |
| "text": "and I briefly covered this already but" | |
| }, | |
| { | |
| "start": 6862.679, | |
| "text": "basically um it's not only that the" | |
| }, | |
| { | |
| "start": 6864.88, | |
| "text": "language model sees less non-english" | |
| }, | |
| { | |
| "start": 6867.159, | |
| "text": "data during training of the model" | |
| }, | |
| { | |
| "start": 6868.76, | |
| "text": "parameters but also the tokenizer is not" | |
| }, | |
| { | |
| "start": 6871.639, | |
| "text": "um is not sufficiently trained on" | |
| }, | |
| { | |
| "start": 6874.639, | |
| "text": "non-english data and so here for example" | |
| }, | |
| { | |
| "start": 6877.28, | |
| "text": "hello how are you is five tokens and its" | |
| }, | |
| { | |
| "start": 6880.52, | |
| "text": "translation is 15 tokens so this is a" | |
| }, | |
| { | |
| "start": 6882.88, | |
| "text": "three times blow up and so for example" | |
| }, | |
| { | |
| "start": 6885.8, | |
| "text": "anang is uh just hello basically in" | |
| }, | |
| { | |
| "start": 6888.639, | |
| "text": "Korean and that end up being three" | |
| }, | |
| { | |
| "start": 6890.32, | |
| "text": "tokens I'm actually kind of surprised by" | |
| }, | |
| { | |
| "start": 6891.8, | |
| "text": "that because that is a very common" | |
| }, | |
| { | |
| "start": 6893.119, | |
| "text": "phrase there just the typical greeting" | |
| }, | |
| { | |
| "start": 6895.159, | |
| "text": "of like hello and that ends up being" | |
| }, | |
| { | |
| "start": 6897.0, | |
| "text": "three tokens whereas our hello is a" | |
| }, | |
| { | |
| "start": 6898.76, | |
| "text": "single token and so basically everything" | |
| }, | |
| { | |
| "start": 6900.56, | |
| "text": "is a lot more bloated and diffuse and" | |
| }, | |
| { | |
| "start": 6902.32, | |
| "text": "this is I think partly the reason that" | |
| }, | |
| { | |
| "start": 6904.079, | |
| "text": "the model Works worse on other" | |
| }, | |
| { | |
| "start": 6907.0, | |
| "text": "languages uh coming back why is LM bad" | |
| }, | |
| { | |
| "start": 6910.04, | |
| "text": "at simple arithmetic um that has to do" | |
| }, | |
| { | |
| "start": 6913.159, | |
| "text": "with the tokenization of numbers and so" | |
| }, | |
| { | |
| "start": 6917.36, | |
| "text": "um you'll notice that for example" | |
| }, | |
| { | |
| "start": 6919.079, | |
| "text": "addition is very sort of" | |
| }, | |
| { | |
| "start": 6920.96, | |
| "text": "like uh there's an algorithm that is" | |
| }, | |
| { | |
| "start": 6923.079, | |
| "text": "like character level for doing addition" | |
| }, | |
| { | |
| "start": 6925.719, | |
| "text": "so for example here we would first add" | |
| }, | |
| { | |
| "start": 6927.639, | |
| "text": "the ones and then the tens and then the" | |
| }, | |
| { | |
| "start": 6929.199, | |
| "text": "hundreds you have to refer to specific" | |
| }, | |
| { | |
| "start": 6931.079, | |
| "text": "parts of these digits but uh these" | |
| }, | |
| { | |
| "start": 6934.719, | |
| "text": "numbers are represented completely" | |
| }, | |
| { | |
| "start": 6936.199, | |
| "text": "arbitrarily based on whatever happened" | |
| }, | |
| { | |
| "start": 6937.679, | |
| "text": "to merge or not merge during the" | |
| }, | |
| { | |
| "start": 6939.28, | |
| "text": "tokenization process there's an entire" | |
| }, | |
| { | |
| "start": 6941.44, | |
| "text": "blog post about this that I think is" | |
| }, | |
| { | |
| "start": 6942.84, | |
| "text": "quite good integer tokenization is" | |
| }, | |
| { | |
| "start": 6944.719, | |
| "text": "insane and this person basically" | |
| }, | |
| { | |
| "start": 6946.679, | |
| "text": "systematically explores the tokenization" | |
| }, | |
| { | |
| "start": 6948.719, | |
| "text": "of numbers in I believe this is gpt2 and" | |
| }, | |
| { | |
| "start": 6952.04, | |
| "text": "so they notice that for example for the" | |
| }, | |
| { | |
| "start": 6953.76, | |
| "text": "for um four-digit numbers you can take a" | |
| }, | |
| { | |
| "start": 6957.28, | |
| "text": "look at whether it is uh a single token" | |
| }, | |
| { | |
| "start": 6960.199, | |
| "text": "or whether it is two tokens that is a 1" | |
| }, | |
| { | |
| "start": 6962.119, | |
| "text": "three or a 2 two or a 31 combination and" | |
| }, | |
| { | |
| "start": 6964.92, | |
| "text": "so all the different numbers are all the" | |
| }, | |
| { | |
| "start": 6966.56, | |
| "text": "different combinations and you can" | |
| }, | |
| { | |
| "start": 6968.04, | |
| "text": "imagine this is all completely" | |
| }, | |
| { | |
| "start": 6969.199, | |
| "text": "arbitrarily so and the model" | |
| }, | |
| { | |
| "start": 6971.28, | |
| "text": "unfortunately sometimes sees uh four um" | |
| }, | |
| { | |
| "start": 6974.159, | |
| "text": "a token for for all four digits" | |
| }, | |
| { | |
| "start": 6976.599, | |
| "text": "sometimes for three sometimes for two" | |
| }, | |
| { | |
| "start": 6978.04, | |
| "text": "sometimes for one and it's in an" | |
| }, | |
| { | |
| "start": 6980.0, | |
| "text": "arbitrary uh Manner and so this is" | |
| }, | |
| { | |
| "start": 6982.52, | |
| "text": "definitely a headwind if you will for" | |
| }, | |
| { | |
| "start": 6985.0, | |
| "text": "the language model and it's kind of" | |
| }, | |
| { | |
| "start": 6986.36, | |
| "text": "incredible that it can kind of do it and" | |
| }, | |
| { | |
| "start": 6987.92, | |
| "text": "deal with it but it's also kind of not" | |
| }, | |
| { | |
| "start": 6990.119, | |
| "text": "ideal and so that's why for example we" | |
| }, | |
| { | |
| "start": 6992.0, | |
| "text": "saw that meta when they train the Llama" | |
| }, | |
| { | |
| "start": 6994.199, | |
| "text": "2 algorithm and they use sentence piece" | |
| }, | |
| { | |
| "start": 6996.44, | |
| "text": "they make sure to split up all the um" | |
| }, | |
| { | |
| "start": 6999.52, | |
| "text": "all the digits as an example for uh" | |
| }, | |
| { | |
| "start": 7002.32, | |
| "text": "llama 2 and this is partly to improve a" | |
| }, | |
| { | |
| "start": 7004.88, | |
| "text": "simple arithmetic kind of" | |
| }, | |
| { | |
| "start": 7006.92, | |
| "text": "performance and finally why is gpt2 not" | |
| }, | |
| { | |
| "start": 7010.52, | |
| "text": "as good in Python again this is partly a" | |
| }, | |
| { | |
| "start": 7012.92, | |
| "text": "modeling issue on in the architecture" | |
| }, | |
| { | |
| "start": 7014.88, | |
| "text": "and the data set and the strength of the" | |
| }, | |
| { | |
| "start": 7016.639, | |
| "text": "model but it's also partially" | |
| }, | |
| { | |
| "start": 7018.199, | |
| "text": "tokenization because as we saw here with" | |
| }, | |
| { | |
| "start": 7020.32, | |
| "text": "the simple python example the encoding" | |
| }, | |
| { | |
| "start": 7023.04, | |
| "text": "efficiency of the tokenizer for handling" | |
| }, | |
| { | |
| "start": 7025.199, | |
| "text": "spaces in Python is terrible and every" | |
| }, | |
| { | |
| "start": 7027.36, | |
| "text": "single space is an individual token and" | |
| }, | |
| { | |
| "start": 7029.44, | |
| "text": "this dramatically reduces the context" | |
| }, | |
| { | |
| "start": 7031.079, | |
| "text": "length that the model can attend to" | |
| }, | |
| { | |
| "start": 7032.52, | |
| "text": "cross so that's almost like a" | |
| }, | |
| { | |
| "start": 7034.079, | |
| "text": "tokenization bug for gpd2 and that was" | |
| }, | |
| { | |
| "start": 7036.8, | |
| "text": "later fixed with gp4 okay so here's" | |
| }, | |
| { | |
| "start": 7040.0, | |
| "text": "another fun one my llm abruptly halts" | |
| }, | |
| { | |
| "start": 7042.52, | |
| "text": "when it sees the string end of text so" | |
| }, | |
| { | |
| "start": 7045.28, | |
| "text": "here's um here's a very strange Behavior" | |
| }, | |
| { | |
| "start": 7048.04, | |
| "text": "print a string end of text is what I" | |
| }, | |
| { | |
| "start": 7050.079, | |
| "text": "told jt4 and it says could you please" | |
| }, | |
| { | |
| "start": 7052.239, | |
| "text": "specify the string and I'm I'm telling" | |
| }, | |
| { | |
| "start": 7055.119, | |
| "text": "it give me end of text and it seems like" | |
| }, | |
| { | |
| "start": 7057.159, | |
| "text": "there's an issue it's not seeing end of" | |
| }, | |
| { | |
| "start": 7059.239, | |
| "text": "text and then I give it end of text is" | |
| }, | |
| { | |
| "start": 7061.599, | |
| "text": "the string and then here's a string and" | |
| }, | |
| { | |
| "start": 7064.239, | |
| "text": "then it just doesn't print it so" | |
| }, | |
| { | |
| "start": 7065.84, | |
| "text": "obviously something is breaking here" | |
| }, | |
| { | |
| "start": 7067.119, | |
| "text": "with respect to the handling of the" | |
| }, | |
| { | |
| "start": 7068.32, | |
| "text": "special token and I don't actually know" | |
| }, | |
| { | |
| "start": 7070.199, | |
| "text": "what open ey is doing under the hood" | |
| }, | |
| { | |
| "start": 7072.639, | |
| "text": "here and whether they are potentially" | |
| }, | |
| { | |
| "start": 7074.52, | |
| "text": "parsing this as an um as an actual token" | |
| }, | |
| { | |
| "start": 7078.96, | |
| "text": "instead of this just being uh end of" | |
| }, | |
| { | |
| "start": 7081.159, | |
| "text": "text um as like individual sort of" | |
| }, | |
| { | |
| "start": 7084.599, | |
| "text": "pieces of it without the special token" | |
| }, | |
| { | |
| "start": 7086.44, | |
| "text": "handling logic and so it might be that" | |
| }, | |
| { | |
| "start": 7089.52, | |
| "text": "someone when they're calling do encode" | |
| }, | |
| { | |
| "start": 7091.76, | |
| "text": "uh they are passing in the allowed" | |
| }, | |
| { | |
| "start": 7093.36, | |
| "text": "special and they are allowing end of" | |
| }, | |
| { | |
| "start": 7096.199, | |
| "text": "text as a special character in the user" | |
| }, | |
| { | |
| "start": 7098.36, | |
| "text": "prompt but the user prompt of course is" | |
| }, | |
| { | |
| "start": 7100.84, | |
| "text": "is a sort of um attacker controlled text" | |
| }, | |
| { | |
| "start": 7103.52, | |
| "text": "so you would hope that they don't really" | |
| }, | |
| { | |
| "start": 7105.32, | |
| "text": "parse or use special tokens or you know" | |
| }, | |
| { | |
| "start": 7108.76, | |
| "text": "from that kind of input but it appears" | |
| }, | |
| { | |
| "start": 7110.599, | |
| "text": "that there's something definitely going" | |
| }, | |
| { | |
| "start": 7111.76, | |
| "text": "wrong here and um so your knowledge of" | |
| }, | |
| { | |
| "start": 7114.8, | |
| "text": "these special tokens ends up being in a" | |
| }, | |
| { | |
| "start": 7116.4, | |
| "text": "tax surface potentially and so if you'd" | |
| }, | |
| { | |
| "start": 7118.88, | |
| "text": "like to confuse llms then just um try to" | |
| }, | |
| { | |
| "start": 7123.0, | |
| "text": "give them some special tokens and see if" | |
| }, | |
| { | |
| "start": 7124.32, | |
| "text": "you're breaking something by chance okay" | |
| }, | |
| { | |
| "start": 7126.4, | |
| "text": "so this next one is a really fun one uh" | |
| }, | |
| { | |
| "start": 7129.48, | |
| "text": "the trailing whites space issue so if" | |
| }, | |
| { | |
| "start": 7132.88, | |
| "text": "you come to playground and uh we come" | |
| }, | |
| { | |
| "start": 7136.0, | |
| "text": "here to GPT 3.5 turbo instruct so this" | |
| }, | |
| { | |
| "start": 7138.44, | |
| "text": "is not a chat model this is a completion" | |
| }, | |
| { | |
| "start": 7140.32, | |
| "text": "model so think of it more like it's a" | |
| }, | |
| { | |
| "start": 7142.88, | |
| "text": "lot more closer to a base model it does" | |
| }, | |
| { | |
| "start": 7145.28, | |
| "text": "completion it will continue the token" | |
| }, | |
| { | |
| "start": 7147.599, | |
| "text": "sequence so here's a tagline for ice" | |
| }, | |
| { | |
| "start": 7149.88, | |
| "text": "cream shop and we want to continue the" | |
| }, | |
| { | |
| "start": 7151.639, | |
| "text": "sequence and so we can submit and get a" | |
| }, | |
| { | |
| "start": 7154.239, | |
| "text": "bunch of tokens okay no problem but now" | |
| }, | |
| { | |
| "start": 7158.239, | |
| "text": "suppose I do this but instead of" | |
| }, | |
| { | |
| "start": 7160.84, | |
| "text": "pressing submit here I do here's a" | |
| }, | |
| { | |
| "start": 7163.119, | |
| "text": "tagline for ice cream shop space so I" | |
| }, | |
| { | |
| "start": 7166.0, | |
| "text": "have a space here before I click" | |
| }, | |
| { | |
| "start": 7168.96, | |
| "text": "submit we get a warning your text ends" | |
| }, | |
| { | |
| "start": 7171.84, | |
| "text": "in a trail Ling space which causes worse" | |
| }, | |
| { | |
| "start": 7173.4, | |
| "text": "performance due to how API splits text" | |
| }, | |
| { | |
| "start": 7175.84, | |
| "text": "into tokens so what's happening here it" | |
| }, | |
| { | |
| "start": 7178.239, | |
| "text": "still gave us a uh sort of completion" | |
| }, | |
| { | |
| "start": 7180.56, | |
| "text": "here but let's take a look at what's" | |
| }, | |
| { | |
| "start": 7182.8, | |
| "text": "happening so here's a tagline for an ice" | |
| }, | |
| { | |
| "start": 7184.88, | |
| "text": "cream shop and then what does this look" | |
| }, | |
| { | |
| "start": 7188.679, | |
| "text": "like in the actual actual training data" | |
| }, | |
| { | |
| "start": 7190.159, | |
| "text": "suppose you found the completion in the" | |
| }, | |
| { | |
| "start": 7192.28, | |
| "text": "training document somewhere on the" | |
| }, | |
| { | |
| "start": 7193.56, | |
| "text": "internet and the llm trained on this" | |
| }, | |
| { | |
| "start": 7195.679, | |
| "text": "data so maybe it's something like oh" | |
| }, | |
| { | |
| "start": 7198.32, | |
| "text": "yeah maybe that's the tagline that's a" | |
| }, | |
| { | |
| "start": 7200.4, | |
| "text": "terrible tagline but notice here that" | |
| }, | |
| { | |
| "start": 7202.76, | |
| "text": "when I create o you see that because" | |
| }, | |
| { | |
| "start": 7205.76, | |
| "text": "there's the the space character is" | |
| }, | |
| { | |
| "start": 7207.8, | |
| "text": "always a prefix to these tokens in GPT" | |
| }, | |
| { | |
| "start": 7211.159, | |
| "text": "so it's not an O token it's a space o" | |
| }, | |
| { | |
| "start": 7213.48, | |
| "text": "token the space is part of the O and" | |
| }, | |
| { | |
| "start": 7216.76, | |
| "text": "together they are token 8840 that's" | |
| }, | |
| { | |
| "start": 7219.239, | |
| "text": "that's space o so what's What's" | |
| }, | |
| { | |
| "start": 7221.92, | |
| "text": "Happening Here is that when I just have" | |
| }, | |
| { | |
| "start": 7224.119, | |
| "text": "it like this and I let it complete the" | |
| }, | |
| { | |
| "start": 7227.04, | |
| "text": "next token it can sample the space o" | |
| }, | |
| { | |
| "start": 7230.04, | |
| "text": "token but instead if I have this and I" | |
| }, | |
| { | |
| "start": 7232.599, | |
| "text": "add my space then what I'm doing here" | |
| }, | |
| { | |
| "start": 7234.76, | |
| "text": "when I incode this string is I have" | |
| }, | |
| { | |
| "start": 7237.639, | |
| "text": "basically here's a t line for an ice" | |
| }, | |
| { | |
| "start": 7239.079, | |
| "text": "cream uh shop and this space at the very" | |
| }, | |
| { | |
| "start": 7242.0, | |
| "text": "end becomes a token" | |
| }, | |
| { | |
| "start": 7244.079, | |
| "text": "220 and so we've added token 220 and" | |
| }, | |
| { | |
| "start": 7247.84, | |
| "text": "this token otherwise would be part of" | |
| }, | |
| { | |
| "start": 7249.76, | |
| "text": "the tagline because if there actually is" | |
| }, | |
| { | |
| "start": 7251.88, | |
| "text": "a tagline here so space o is the token" | |
| }, | |
| { | |
| "start": 7255.239, | |
| "text": "and so this is suddenly a of" | |
| }, | |
| { | |
| "start": 7257.32, | |
| "text": "distribution for the model because this" | |
| }, | |
| { | |
| "start": 7259.679, | |
| "text": "space is part of the next token but" | |
| }, | |
| { | |
| "start": 7261.52, | |
| "text": "we're putting it here like this and the" | |
| }, | |
| { | |
| "start": 7264.04, | |
| "text": "model has seen very very little data of" | |
| }, | |
| { | |
| "start": 7267.199, | |
| "text": "actual Space by itself and we're asking" | |
| }, | |
| { | |
| "start": 7270.079, | |
| "text": "it to complete the sequence like add in" | |
| }, | |
| { | |
| "start": 7271.719, | |
| "text": "more tokens but the problem is that" | |
| }, | |
| { | |
| "start": 7273.48, | |
| "text": "we've sort of begun the first token and" | |
| }, | |
| { | |
| "start": 7276.36, | |
| "text": "now it's been split up and now we're out" | |
| }, | |
| { | |
| "start": 7278.76, | |
| "text": "of this distribution and now arbitrary" | |
| }, | |
| { | |
| "start": 7280.76, | |
| "text": "bad things happen and it's just a very" | |
| }, | |
| { | |
| "start": 7283.04, | |
| "text": "rare example for it to see something" | |
| }, | |
| { | |
| "start": 7284.56, | |
| "text": "like that and uh that's why we get the" | |
| }, | |
| { | |
| "start": 7286.92, | |
| "text": "warning so the fundamental issue here is" | |
| }, | |
| { | |
| "start": 7289.119, | |
| "text": "of course that um the llm is on top of" | |
| }, | |
| { | |
| "start": 7292.44, | |
| "text": "these tokens and these tokens are text" | |
| }, | |
| { | |
| "start": 7294.599, | |
| "text": "chunks they're not characters in a way" | |
| }, | |
| { | |
| "start": 7296.56, | |
| "text": "you and I would think of them they are" | |
| }, | |
| { | |
| "start": 7298.199, | |
| "text": "these are the atoms of what the LM is" | |
| }, | |
| { | |
| "start": 7300.36, | |
| "text": "seeing and there's a bunch of weird" | |
| }, | |
| { | |
| "start": 7301.8, | |
| "text": "stuff that comes out of it let's go back" | |
| }, | |
| { | |
| "start": 7303.639, | |
| "text": "to our default cell style I bet you that" | |
| }, | |
| { | |
| "start": 7308.0, | |
| "text": "the model has never in its training set" | |
| }, | |
| { | |
| "start": 7309.96, | |
| "text": "seen default cell sta without Le in" | |
| }, | |
| { | |
| "start": 7314.199, | |
| "text": "there it's always seen this as a single" | |
| }, | |
| { | |
| "start": 7316.599, | |
| "text": "group because uh this is some kind of a" | |
| }, | |
| { | |
| "start": 7319.239, | |
| "text": "function in um I'm guess I don't" | |
| }, | |
| { | |
| "start": 7322.0, | |
| "text": "actually know what this is part of this" | |
| }, | |
| { | |
| "start": 7323.079, | |
| "text": "is some kind of API but I bet you that" | |
| }, | |
| { | |
| "start": 7325.119, | |
| "text": "it's never seen this combination of" | |
| }, | |
| { | |
| "start": 7327.079, | |
| "text": "tokens uh in its training data because" | |
| }, | |
| { | |
| "start": 7330.639, | |
| "text": "or I think it would be extremely rare so" | |
| }, | |
| { | |
| "start": 7332.36, | |
| "text": "I took this and I copy pasted it here" | |
| }, | |
| { | |
| "start": 7334.719, | |
| "text": "and I had I tried to complete from it" | |
| }, | |
| { | |
| "start": 7337.48, | |
| "text": "and the it immediately gave me a big" | |
| }, | |
| { | |
| "start": 7339.199, | |
| "text": "error and it said the model predicted to" | |
| }, | |
| { | |
| "start": 7341.079, | |
| "text": "completion that begins with a stop" | |
| }, | |
| { | |
| "start": 7342.32, | |
| "text": "sequence resulting in no output consider" | |
| }, | |
| { | |
| "start": 7344.159, | |
| "text": "adjusting your prompt or stop sequences" | |
| }, | |
| { | |
| "start": 7346.36, | |
| "text": "so what happened here when I clicked" | |
| }, | |
| { | |
| "start": 7347.639, | |
| "text": "submit is that immediately the model" | |
| }, | |
| { | |
| "start": 7350.199, | |
| "text": "emitted and sort of like end of text" | |
| }, | |
| { | |
| "start": 7352.239, | |
| "text": "token I think or something like that it" | |
| }, | |
| { | |
| "start": 7354.44, | |
| "text": "basically predicted the stop sequence" | |
| }, | |
| { | |
| "start": 7356.44, | |
| "text": "immediately so it had no completion and" | |
| }, | |
| { | |
| "start": 7358.76, | |
| "text": "so this is why I'm getting a warning" | |
| }, | |
| { | |
| "start": 7360.199, | |
| "text": "again because we're off the data" | |
| }, | |
| { | |
| "start": 7362.159, | |
| "text": "distribution and the model is just uh" | |
| }, | |
| { | |
| "start": 7365.119, | |
| "text": "predicting just totally arbitrary things" | |
| }, | |
| { | |
| "start": 7367.639, | |
| "text": "it's just really confused basically this" | |
| }, | |
| { | |
| "start": 7369.44, | |
| "text": "is uh this is giving it brain damage" | |
| }, | |
| { | |
| "start": 7370.92, | |
| "text": "it's never seen this before it's shocked" | |
| }, | |
| { | |
| "start": 7373.32, | |
| "text": "and it's predicting end of text or" | |
| }, | |
| { | |
| "start": 7374.56, | |
| "text": "something I tried it again here and it" | |
| }, | |
| { | |
| "start": 7377.04, | |
| "text": "in this case it completed it but then" | |
| }, | |
| { | |
| "start": 7379.079, | |
| "text": "for some reason this request May violate" | |
| }, | |
| { | |
| "start": 7381.44, | |
| "text": "our usage policies this was" | |
| }, | |
| { | |
| "start": 7383.639, | |
| "text": "flagged um basically something just like" | |
| }, | |
| { | |
| "start": 7386.639, | |
| "text": "goes wrong and there's something like" | |
| }, | |
| { | |
| "start": 7387.679, | |
| "text": "Jank you can just feel the Jank because" | |
| }, | |
| { | |
| "start": 7389.52, | |
| "text": "the model is like extremely unhappy with" | |
| }, | |
| { | |
| "start": 7391.4, | |
| "text": "just this and it doesn't know how to" | |
| }, | |
| { | |
| "start": 7392.96, | |
| "text": "complete it because it's never occurred" | |
| }, | |
| { | |
| "start": 7394.159, | |
| "text": "in training set in a training set it" | |
| }, | |
| { | |
| "start": 7396.199, | |
| "text": "always appears like this and becomes a" | |
| }, | |
| { | |
| "start": 7398.32, | |
| "text": "single token" | |
| }, | |
| { | |
| "start": 7400.04, | |
| "text": "so these kinds of issues where tokens" | |
| }, | |
| { | |
| "start": 7401.96, | |
| "text": "are either you sort of like complete the" | |
| }, | |
| { | |
| "start": 7404.239, | |
| "text": "first character of the next token or you" | |
| }, | |
| { | |
| "start": 7406.76, | |
| "text": "are sort of you have long tokens that" | |
| }, | |
| { | |
| "start": 7408.56, | |
| "text": "you then have just some of the" | |
| }, | |
| { | |
| "start": 7409.8, | |
| "text": "characters off all of these are kind of" | |
| }, | |
| { | |
| "start": 7412.32, | |
| "text": "like issues with partial tokens is how I" | |
| }, | |
| { | |
| "start": 7415.36, | |
| "text": "would describe it and if you actually" | |
| }, | |
| { | |
| "start": 7417.76, | |
| "text": "dig into the T token" | |
| }, | |
| { | |
| "start": 7419.8, | |
| "text": "repository go to the rust code and" | |
| }, | |
| { | |
| "start": 7421.96, | |
| "text": "search for" | |
| }, | |
| { | |
| "start": 7424.159, | |
| "text": "unstable and you'll see um en code" | |
| }, | |
| { | |
| "start": 7427.079, | |
| "text": "unstable native unstable token tokens" | |
| }, | |
| { | |
| "start": 7429.239, | |
| "text": "and a lot of like special case handling" | |
| }, | |
| { | |
| "start": 7431.52, | |
| "text": "none of this stuff about unstable tokens" | |
| }, | |
| { | |
| "start": 7433.4, | |
| "text": "is documented anywhere but there's a ton" | |
| }, | |
| { | |
| "start": 7435.48, | |
| "text": "of code dealing with unstable tokens and" | |
| }, | |
| { | |
| "start": 7438.36, | |
| "text": "unstable tokens is exactly kind of like" | |
| }, | |
| { | |
| "start": 7440.8, | |
| "text": "what I'm describing here what you would" | |
| }, | |
| { | |
| "start": 7442.76, | |
| "text": "like out of a completion API is" | |
| }, | |
| { | |
| "start": 7445.239, | |
| "text": "something a lot more fancy like if we're" | |
| }, | |
| { | |
| "start": 7446.599, | |
| "text": "putting in default cell sta if we're" | |
| }, | |
| { | |
| "start": 7448.96, | |
| "text": "asking for the next token sequence we're" | |
| }, | |
| { | |
| "start": 7450.679, | |
| "text": "not actually trying to append the next" | |
| }, | |
| { | |
| "start": 7452.239, | |
| "text": "token exactly after this list we're" | |
| }, | |
| { | |
| "start": 7454.639, | |
| "text": "actually trying to append we're trying" | |
| }, | |
| { | |
| "start": 7456.48, | |
| "text": "to consider lots of tokens um" | |
| }, | |
| { | |
| "start": 7459.52, | |
| "text": "that if we were or I guess like we're" | |
| }, | |
| { | |
| "start": 7462.159, | |
| "text": "trying to search over characters that if" | |
| }, | |
| { | |
| "start": 7465.76, | |
| "text": "we retened would be of high probability" | |
| }, | |
| { | |
| "start": 7468.159, | |
| "text": "if that makes sense um so that we can" | |
| }, | |
| { | |
| "start": 7470.679, | |
| "text": "actually add a single individual" | |
| }, | |
| { | |
| "start": 7472.32, | |
| "text": "character uh instead of just like adding" | |
| }, | |
| { | |
| "start": 7474.48, | |
| "text": "the next full token that comes after" | |
| }, | |
| { | |
| "start": 7476.679, | |
| "text": "this partial token list so I this is" | |
| }, | |
| { | |
| "start": 7479.36, | |
| "text": "very tricky to describe and I invite you" | |
| }, | |
| { | |
| "start": 7481.32, | |
| "text": "to maybe like look through this it ends" | |
| }, | |
| { | |
| "start": 7483.04, | |
| "text": "up being extremely gnarly and hairy kind" | |
| }, | |
| { | |
| "start": 7484.679, | |
| "text": "of topic it and it comes from" | |
| }, | |
| { | |
| "start": 7486.36, | |
| "text": "tokenization fundamentally so um maybe I" | |
| }, | |
| { | |
| "start": 7489.4, | |
| "text": "can even spend an entire video talking" | |
| }, | |
| { | |
| "start": 7490.8, | |
| "text": "about unstable tokens sometime in the" | |
| }, | |
| { | |
| "start": 7492.119, | |
| "text": "future okay and I'm really saving the" | |
| }, | |
| { | |
| "start": 7494.199, | |
| "text": "best for last my favorite one by far is" | |
| }, | |
| { | |
| "start": 7496.599, | |
| "text": "the solid gold" | |
| }, | |
| { | |
| "start": 7499.199, | |
| "text": "Magikarp and it just okay so this comes" | |
| }, | |
| { | |
| "start": 7501.36, | |
| "text": "from this blog post uh solid gold" | |
| }, | |
| { | |
| "start": 7503.639, | |
| "text": "Magikarp and uh this is um internet" | |
| }, | |
| { | |
| "start": 7507.0, | |
| "text": "famous now for those of us in llms and" | |
| }, | |
| { | |
| "start": 7510.079, | |
| "text": "basically I I would advise you to uh" | |
| }, | |
| { | |
| "start": 7511.84, | |
| "text": "read this block Post in full but" | |
| }, | |
| { | |
| "start": 7513.679, | |
| "text": "basically what this person was doing is" | |
| }, | |
| { | |
| "start": 7516.559, | |
| "text": "this person went to the um" | |
| }, | |
| { | |
| "start": 7519.239, | |
| "text": "token embedding stable and clustered the" | |
| }, | |
| { | |
| "start": 7522.32, | |
| "text": "tokens based on their embedding" | |
| }, | |
| { | |
| "start": 7524.8, | |
| "text": "representation and this person noticed" | |
| }, | |
| { | |
| "start": 7527.28, | |
| "text": "that there's a cluster of tokens that" | |
| }, | |
| { | |
| "start": 7529.239, | |
| "text": "look really strange so there's a cluster" | |
| }, | |
| { | |
| "start": 7531.159, | |
| "text": "here at rot e stream Fame solid gold" | |
| }, | |
| { | |
| "start": 7534.079, | |
| "text": "Magikarp Signet message like really" | |
| }, | |
| { | |
| "start": 7536.0, | |
| "text": "weird tokens in uh basically in this" | |
| }, | |
| { | |
| "start": 7539.96, | |
| "text": "embedding cluster and so what are these" | |
| }, | |
| { | |
| "start": 7542.239, | |
| "text": "tokens and where do they even come from" | |
| }, | |
| { | |
| "start": 7543.679, | |
| "text": "like what is solid gold magikarpet makes" | |
| }, | |
| { | |
| "start": 7545.4, | |
| "text": "no sense and then they found bunch of" | |
| }, | |
| { | |
| "start": 7548.96, | |
| "text": "these" | |
| }, | |
| { | |
| "start": 7550.199, | |
| "text": "tokens and then they notice that" | |
| }, | |
| { | |
| "start": 7552.119, | |
| "text": "actually the plot thickens here because" | |
| }, | |
| { | |
| "start": 7553.559, | |
| "text": "if you ask the model about these tokens" | |
| }, | |
| { | |
| "start": 7556.04, | |
| "text": "like you ask it uh some very benign" | |
| }, | |
| { | |
| "start": 7558.639, | |
| "text": "question like please can you repeat back" | |
| }, | |
| { | |
| "start": 7560.199, | |
| "text": "to me the string sold gold Magikarp uh" | |
| }, | |
| { | |
| "start": 7562.96, | |
| "text": "then you get a variety of basically" | |
| }, | |
| { | |
| "start": 7564.8, | |
| "text": "totally broken llm Behavior so either" | |
| }, | |
| { | |
| "start": 7567.76, | |
| "text": "you get evasion so I'm sorry I can't" | |
| }, | |
| { | |
| "start": 7569.84, | |
| "text": "hear you or you get a bunch of" | |
| }, | |
| { | |
| "start": 7571.4, | |
| "text": "hallucinations as a response um you can" | |
| }, | |
| { | |
| "start": 7574.559, | |
| "text": "even get back like insults so you ask it" | |
| }, | |
| { | |
| "start": 7577.28, | |
| "text": "uh about streamer bot it uh tells the" | |
| }, | |
| { | |
| "start": 7580.0, | |
| "text": "and the model actually just calls you" | |
| }, | |
| { | |
| "start": 7582.04, | |
| "text": "names uh or it kind of comes up with" | |
| }, | |
| { | |
| "start": 7584.159, | |
| "text": "like weird humor like you're actually" | |
| }, | |
| { | |
| "start": 7586.239, | |
| "text": "breaking the model by asking about these" | |
| }, | |
| { | |
| "start": 7588.48, | |
| "text": "very simple strings like at Roth and" | |
| }, | |
| { | |
| "start": 7590.52, | |
| "text": "sold gold Magikarp so like what the hell" | |
| }, | |
| { | |
| "start": 7592.84, | |
| "text": "is happening and there's a variety of" | |
| }, | |
| { | |
| "start": 7594.48, | |
| "text": "here documented behaviors uh there's a" | |
| }, | |
| { | |
| "start": 7597.079, | |
| "text": "bunch of tokens not just so good" | |
| }, | |
| { | |
| "start": 7598.48, | |
| "text": "Magikarp that have that kind of a" | |
| }, | |
| { | |
| "start": 7600.28, | |
| "text": "behavior and so basically there's a" | |
| }, | |
| { | |
| "start": 7602.119, | |
| "text": "bunch of like trigger words and if you" | |
| }, | |
| { | |
| "start": 7604.159, | |
| "text": "ask the model about these trigger words" | |
| }, | |
| { | |
| "start": 7606.04, | |
| "text": "or you just include them in your prompt" | |
| }, | |
| { | |
| "start": 7608.04, | |
| "text": "the model goes haywire and has all kinds" | |
| }, | |
| { | |
| "start": 7610.0, | |
| "text": "of uh really Strange Behaviors including" | |
| }, | |
| { | |
| "start": 7612.8, | |
| "text": "sort of ones that violate typical safety" | |
| }, | |
| { | |
| "start": 7614.84, | |
| "text": "guidelines uh and the alignment of the" | |
| }, | |
| { | |
| "start": 7617.0, | |
| "text": "model like it's swearing back at you so" | |
| }, | |
| { | |
| "start": 7619.84, | |
| "text": "what is happening here and how can this" | |
| }, | |
| { | |
| "start": 7621.76, | |
| "text": "possibly be true well this again comes" | |
| }, | |
| { | |
| "start": 7624.559, | |
| "text": "down to tokenization so what's happening" | |
| }, | |
| { | |
| "start": 7626.719, | |
| "text": "here is that sold gold Magikarp if you" | |
| }, | |
| { | |
| "start": 7628.76, | |
| "text": "actually dig into it is a Reddit user so" | |
| }, | |
| { | |
| "start": 7631.719, | |
| "text": "there's a u Sol gold" | |
| }, | |
| { | |
| "start": 7634.04, | |
| "text": "Magikarp and probably what happened here" | |
| }, | |
| { | |
| "start": 7636.8, | |
| "text": "even though I I don't know that this has" | |
| }, | |
| { | |
| "start": 7638.0, | |
| "text": "been like really definitively explored" | |
| }, | |
| { | |
| "start": 7640.44, | |
| "text": "but what is thought to have happened is" | |
| }, | |
| { | |
| "start": 7643.159, | |
| "text": "that the tokenization data set was very" | |
| }, | |
| { | |
| "start": 7645.559, | |
| "text": "different from the training data set for" | |
| }, | |
| { | |
| "start": 7648.0, | |
| "text": "the actual language model so in the" | |
| }, | |
| { | |
| "start": 7649.92, | |
| "text": "tokenization data set there was a ton of" | |
| }, | |
| { | |
| "start": 7651.52, | |
| "text": "redded data potentially where the user" | |
| }, | |
| { | |
| "start": 7654.599, | |
| "text": "solid gold Magikarp was mentioned in the" | |
| }, | |
| { | |
| "start": 7656.4, | |
| "text": "text because solid gold Magikarp was a" | |
| }, | |
| { | |
| "start": 7659.199, | |
| "text": "very common um sort of uh person who" | |
| }, | |
| { | |
| "start": 7661.679, | |
| "text": "would post a lot uh this would be a" | |
| }, | |
| { | |
| "start": 7663.679, | |
| "text": "string that occurs many times in a" | |
| }, | |
| { | |
| "start": 7665.28, | |
| "text": "tokenization data set because it occurs" | |
| }, | |
| { | |
| "start": 7668.0, | |
| "text": "many times in a tokenization data set" | |
| }, | |
| { | |
| "start": 7670.0, | |
| "text": "these tokens would end up getting merged" | |
| }, | |
| { | |
| "start": 7671.48, | |
| "text": "to the single individual token for that" | |
| }, | |
| { | |
| "start": 7673.52, | |
| "text": "single Reddit user sold gold Magikarp so" | |
| }, | |
| { | |
| "start": 7676.4, | |
| "text": "they would have a dedicated token in a" | |
| }, | |
| { | |
| "start": 7678.36, | |
| "text": "vocabulary of was it 50,000 tokens in" | |
| }, | |
| { | |
| "start": 7680.719, | |
| "text": "gpd2 that is devoted to that Reddit user" | |
| }, | |
| { | |
| "start": 7684.119, | |
| "text": "and then what happens is the" | |
| }, | |
| { | |
| "start": 7685.599, | |
| "text": "tokenization data set has those strings" | |
| }, | |
| { | |
| "start": 7688.599, | |
| "text": "but then later when you train the model" | |
| }, | |
| { | |
| "start": 7690.92, | |
| "text": "the language model itself um this data" | |
| }, | |
| { | |
| "start": 7693.92, | |
| "text": "from Reddit was not present and so" | |
| }, | |
| { | |
| "start": 7696.679, | |
| "text": "therefore in the entire training set for" | |
| }, | |
| { | |
| "start": 7698.8, | |
| "text": "the language model sold gold Magikarp" | |
| }, | |
| { | |
| "start": 7701.28, | |
| "text": "never occurs that token never appears in" | |
| }, | |
| { | |
| "start": 7704.32, | |
| "text": "the training set for the actual language" | |
| }, | |
| { | |
| "start": 7705.84, | |
| "text": "model later so this token never gets" | |
| }, | |
| { | |
| "start": 7708.92, | |
| "text": "activated it's initialized at random in" | |
| }, | |
| { | |
| "start": 7711.04, | |
| "text": "the beginning of optimization then you" | |
| }, | |
| { | |
| "start": 7712.88, | |
| "text": "have forward backward passes and updates" | |
| }, | |
| { | |
| "start": 7714.48, | |
| "text": "to the model and this token is just" | |
| }, | |
| { | |
| "start": 7716.0, | |
| "text": "never updated in the embedding table" | |
| }, | |
| { | |
| "start": 7717.92, | |
| "text": "that row Vector never gets sampled it" | |
| }, | |
| { | |
| "start": 7720.0, | |
| "text": "never gets used so it never gets trained" | |
| }, | |
| { | |
| "start": 7722.04, | |
| "text": "and it's completely untrained it's kind" | |
| }, | |
| { | |
| "start": 7723.88, | |
| "text": "of like unallocated memory in a typical" | |
| }, | |
| { | |
| "start": 7726.4, | |
| "text": "binary program written in C or something" | |
| }, | |
| { | |
| "start": 7728.159, | |
| "text": "like that that so it's unallocated" | |
| }, | |
| { | |
| "start": 7730.0, | |
| "text": "memory and then at test time if you" | |
| }, | |
| { | |
| "start": 7731.84, | |
| "text": "evoke this token then you're basically" | |
| }, | |
| { | |
| "start": 7734.28, | |
| "text": "plucking out a row of the embedding" | |
| }, | |
| { | |
| "start": 7735.639, | |
| "text": "table that is completely untrained and" | |
| }, | |
| { | |
| "start": 7737.32, | |
| "text": "that feeds into a Transformer and" | |
| }, | |
| { | |
| "start": 7738.92, | |
| "text": "creates undefined behavior and that's" | |
| }, | |
| { | |
| "start": 7740.96, | |
| "text": "what we're seeing here this completely" | |
| }, | |
| { | |
| "start": 7742.159, | |
| "text": "undefined never before seen in a" | |
| }, | |
| { | |
| "start": 7743.88, | |
| "text": "training behavior and so any of these" | |
| }, | |
| { | |
| "start": 7746.559, | |
| "text": "kind of like weird tokens would evoke" | |
| }, | |
| { | |
| "start": 7748.0, | |
| "text": "this Behavior because fundamentally the" | |
| }, | |
| { | |
| "start": 7749.32, | |
| "text": "model is um is uh uh out of sample out" | |
| }, | |
| { | |
| "start": 7754.48, | |
| "text": "of distribution okay and the very last" | |
| }, | |
| { | |
| "start": 7756.76, | |
| "text": "thing I wanted to just briefly mention" | |
| }, | |
| { | |
| "start": 7758.52, | |
| "text": "point out although I think a lot of" | |
| }, | |
| { | |
| "start": 7759.679, | |
| "text": "people are quite aware of this is that" | |
| }, | |
| { | |
| "start": 7761.639, | |
| "text": "different kinds of formats and different" | |
| }, | |
| { | |
| "start": 7763.159, | |
| "text": "representations and different languages" | |
| }, | |
| { | |
| "start": 7765.0, | |
| "text": "and so on might be more or less" | |
| }, | |
| { | |
| "start": 7766.88, | |
| "text": "efficient with GPD tokenizers uh or any" | |
| }, | |
| { | |
| "start": 7769.8, | |
| "text": "tokenizers for any other L for that" | |
| }, | |
| { | |
| "start": 7771.4, | |
| "text": "matter so for example Json is actually" | |
| }, | |
| { | |
| "start": 7773.559, | |
| "text": "really dense in tokens and yaml is a lot" | |
| }, | |
| { | |
| "start": 7776.32, | |
| "text": "more efficient in tokens um so for" | |
| }, | |
| { | |
| "start": 7779.239, | |
| "text": "example this are these are the same in" | |
| }, | |
| { | |
| "start": 7781.32, | |
| "text": "Json and in yaml the Json is" | |
| }, | |
| { | |
| "start": 7784.599, | |
| "text": "116 and the yaml is 99 so quite a bit of" | |
| }, | |
| { | |
| "start": 7788.119, | |
| "text": "an Improvement and so in the token" | |
| }, | |
| { | |
| "start": 7791.639, | |
| "text": "economy where we are paying uh per token" | |
| }, | |
| { | |
| "start": 7793.639, | |
| "text": "in many ways and you are paying in the" | |
| }, | |
| { | |
| "start": 7795.679, | |
| "text": "context length and you're paying in um" | |
| }, | |
| { | |
| "start": 7797.639, | |
| "text": "dollar amount for uh the cost of" | |
| }, | |
| { | |
| "start": 7799.88, | |
| "text": "processing all this kind of structured" | |
| }, | |
| { | |
| "start": 7801.199, | |
| "text": "data when you have to um so prefer to" | |
| }, | |
| { | |
| "start": 7803.52, | |
| "text": "use theal over Json and in general kind" | |
| }, | |
| { | |
| "start": 7806.079, | |
| "text": "of like the tokenization density is" | |
| }, | |
| { | |
| "start": 7807.599, | |
| "text": "something that you have to um sort of" | |
| }, | |
| { | |
| "start": 7809.84, | |
| "text": "care about and worry about at all times" | |
| }, | |
| { | |
| "start": 7811.679, | |
| "text": "and try to find efficient encoding" | |
| }, | |
| { | |
| "start": 7813.4, | |
| "text": "schemes and spend a lot of time in tick" | |
| }, | |
| { | |
| "start": 7815.4, | |
| "text": "tokenizer and measure the different" | |
| }, | |
| { | |
| "start": 7816.88, | |
| "text": "token efficiencies of different formats" | |
| }, | |
| { | |
| "start": 7818.92, | |
| "text": "and settings and so on okay so that" | |
| }, | |
| { | |
| "start": 7821.0, | |
| "text": "concludes my fairly long video on" | |
| }, | |
| { | |
| "start": 7823.36, | |
| "text": "tokenization I know it's a try I know" | |
| }, | |
| { | |
| "start": 7825.96, | |
| "text": "it's annoying I know it's irritating I" | |
| }, | |
| { | |
| "start": 7828.44, | |
| "text": "personally really dislike the stage what" | |
| }, | |
| { | |
| "start": 7830.88, | |
| "text": "I do have to say at this point is don't" | |
| }, | |
| { | |
| "start": 7832.599, | |
| "text": "brush it off there's a lot of foot guns" | |
| }, | |
| { | |
| "start": 7834.96, | |
| "text": "sharp edges here security issues uh AI" | |
| }, | |
| { | |
| "start": 7838.119, | |
| "text": "safety issues as we saw plugging in" | |
| }, | |
| { | |
| "start": 7839.88, | |
| "text": "unallocated memory into uh language" | |
| }, | |
| { | |
| "start": 7842.079, | |
| "text": "models so um it's worth understanding" | |
| }, | |
| { | |
| "start": 7845.159, | |
| "text": "this stage um that said I will say that" | |
| }, | |
| { | |
| "start": 7848.48, | |
| "text": "eternal glory goes to anyone who can get" | |
| }, | |
| { | |
| "start": 7850.32, | |
| "text": "rid of it uh I showed you one possible" | |
| }, | |
| { | |
| "start": 7852.559, | |
| "text": "paper that tried to uh do that and I" | |
| }, | |
| { | |
| "start": 7854.679, | |
| "text": "think I hope a lot more can follow over" | |
| }, | |
| { | |
| "start": 7857.04, | |
| "text": "time and my final recommendations for" | |
| }, | |
| { | |
| "start": 7859.4, | |
| "text": "the application right now are if you can" | |
| }, | |
| { | |
| "start": 7861.44, | |
| "text": "reuse the GPT 4 tokens and the" | |
| }, | |
| { | |
| "start": 7863.04, | |
| "text": "vocabulary uh in your application then" | |
| }, | |
| { | |
| "start": 7865.0, | |
| "text": "that's something you should consider and" | |
| }, | |
| { | |
| "start": 7866.199, | |
| "text": "just use Tech token because it is very" | |
| }, | |
| { | |
| "start": 7867.84, | |
| "text": "efficient and nice library for inference" | |
| }, | |
| { | |
| "start": 7871.239, | |
| "text": "for bpe I also really like the bite" | |
| }, | |
| { | |
| "start": 7873.719, | |
| "text": "level BP that uh Tik toen and openi uses" | |
| }, | |
| { | |
| "start": 7877.32, | |
| "text": "uh if you for some reason want to train" | |
| }, | |
| { | |
| "start": 7879.04, | |
| "text": "your own vocabulary from scratch um then" | |
| }, | |
| { | |
| "start": 7882.679, | |
| "text": "I would use uh the bpe with sentence" | |
| }, | |
| { | |
| "start": 7885.0, | |
| "text": "piece um oops as I mentioned I'm not a" | |
| }, | |
| { | |
| "start": 7888.119, | |
| "text": "huge fan of sentence piece I don't like" | |
| }, | |
| { | |
| "start": 7890.679, | |
| "text": "its uh bite fallback and I don't like" | |
| }, | |
| { | |
| "start": 7893.92, | |
| "text": "that it's doing BP on unic code code" | |
| }, | |
| { | |
| "start": 7895.559, | |
| "text": "points I think it's uh it also has like" | |
| }, | |
| { | |
| "start": 7897.76, | |
| "text": "a million settings and I think there's a" | |
| }, | |
| { | |
| "start": 7899.119, | |
| "text": "lot of foot gonss here and I think it's" | |
| }, | |
| { | |
| "start": 7900.4, | |
| "text": "really easy to Mis calibrate them and" | |
| }, | |
| { | |
| "start": 7902.199, | |
| "text": "you end up cropping your sentences or" | |
| }, | |
| { | |
| "start": 7903.76, | |
| "text": "something like that uh because of some" | |
| }, | |
| { | |
| "start": 7905.8, | |
| "text": "type of parameter that you don't fully" | |
| }, | |
| { | |
| "start": 7907.28, | |
| "text": "understand so so be very careful with" | |
| }, | |
| { | |
| "start": 7909.44, | |
| "text": "the settings try to copy paste exactly" | |
| }, | |
| { | |
| "start": 7911.719, | |
| "text": "maybe where what meta did or basically" | |
| }, | |
| { | |
| "start": 7914.28, | |
| "text": "spend a lot of time looking at all the" | |
| }, | |
| { | |
| "start": 7916.119, | |
| "text": "hyper parameters and go through the code" | |
| }, | |
| { | |
| "start": 7917.48, | |
| "text": "of sentence piece and make sure that you" | |
| }, | |
| { | |
| "start": 7919.079, | |
| "text": "have this correct um but even if you" | |
| }, | |
| { | |
| "start": 7922.04, | |
| "text": "have all the settings correct I still" | |
| }, | |
| { | |
| "start": 7923.48, | |
| "text": "think that the algorithm is kind of" | |
| }, | |
| { | |
| "start": 7924.92, | |
| "text": "inferior to what's happening here and" | |
| }, | |
| { | |
| "start": 7927.679, | |
| "text": "maybe the best if you really need to" | |
| }, | |
| { | |
| "start": 7929.52, | |
| "text": "train your vocabulary maybe the best" | |
| }, | |
| { | |
| "start": 7931.32, | |
| "text": "thing is to just wait for M bpe to" | |
| }, | |
| { | |
| "start": 7933.159, | |
| "text": "becomes as efficient as possible and uh" | |
| }, | |
| { | |
| "start": 7936.84, | |
| "text": "that's something that maybe I hope to" | |
| }, | |
| { | |
| "start": 7938.159, | |
| "text": "work on and at some point maybe we can" | |
| }, | |
| { | |
| "start": 7940.8, | |
| "text": "be training basically really what we" | |
| }, | |
| { | |
| "start": 7942.88, | |
| "text": "want is we want tick token but training" | |
| }, | |
| { | |
| "start": 7944.96, | |
| "text": "code and that is the ideal thing that" | |
| }, | |
| { | |
| "start": 7947.84, | |
| "text": "currently does not exist and minbpe is um" | |
| }, | |
| { | |
| "start": 7951.36, | |
| "text": "is an implementation of it but currently" | |
| }, | |
| { | |
| "start": 7953.239, | |
| "text": "it's in Python so that's currently what" | |
| }, | |
| { | |
| "start": 7955.88, | |
| "text": "I have to say for uh tokenization there" | |
| }, | |
| { | |
| "start": 7958.199, | |
| "text": "might be an advanced video that has even" | |
| }, | |
| { | |
| "start": 7960.4, | |
| "text": "drier and even more detailed in the" | |
| }, | |
| { | |
| "start": 7961.92, | |
| "text": "future but for now I think we're going" | |
| }, | |
| { | |
| "start": 7963.639, | |
| "text": "to leave things off here and uh I hope" | |
| }, | |
| { | |
| "start": 7966.76, | |
| "text": "that was helpful bye" | |
| }, | |
| { | |
| "start": 7974.119, | |
| "text": "and uh they increase this context size" | |
| }, | |
| { | |
| "start": 7976.04, | |
| "text": "from gpt1 of 512 uh to 1024 and GPT 4" | |
| }, | |
| { | |
| "start": 7982.679, | |
| "text": "two the" | |
| }, | |
| { | |
| "start": 7985.44, | |
| "text": "next okay next I would like us to" | |
| }, | |
| { | |
| "start": 7987.639, | |
| "text": "briefly walk through the code from open" | |
| }, | |
| { | |
| "start": 7989.8, | |
| "text": "AI on the gpt2 encoder" | |
| }, | |
| { | |
| "start": 7995.84, | |
| "text": "ATP I'm sorry I'm gonna sneeze" | |
| }, | |
| { | |
| "start": 7999.119, | |
| "text": "and then what's Happening Here" | |
| }, | |
| { | |
| "start": 8001.84, | |
| "text": "is this is a spurious layer that I will" | |
| }, | |
| { | |
| "start": 8004.639, | |
| "text": "explain in a" | |
| }, | |
| { | |
| "start": 8006.119, | |
| "text": "bit What's Happening Here" | |
| }, | |
| { | |
| "start": 8013.159, | |
| "text": "is" | |
| } | |
| ] |