initial commit
Browse files- .idea/.gitignore +8 -0
- .idea/inspectionProfiles/Project_Default.xml +10 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/llm_learn.iml +10 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- demo.ipynb +77 -0
- demo.py +4 -0
.idea/.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default ignored files
|
| 2 |
+
/shelf/
|
| 3 |
+
/workspace.xml
|
| 4 |
+
# Editor-based HTTP Client requests
|
| 5 |
+
/httpRequests/
|
| 6 |
+
# Datasource local storage ignored files
|
| 7 |
+
/dataSources/
|
| 8 |
+
/dataSources.local.xml
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<profile version="1.0">
|
| 3 |
+
<option name="myName" value="Project Default" />
|
| 4 |
+
<inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
| 5 |
+
<Languages>
|
| 6 |
+
<language minSize="533" name="Python" />
|
| 7 |
+
</Languages>
|
| 8 |
+
</inspection_tool>
|
| 9 |
+
</profile>
|
| 10 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<settings>
|
| 3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
| 4 |
+
<version value="1.0" />
|
| 5 |
+
</settings>
|
| 6 |
+
</component>
|
.idea/llm_learn.iml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<module type="PYTHON_MODULE" version="4">
|
| 3 |
+
<component name="NewModuleRootManager">
|
| 4 |
+
<content url="file://$MODULE_DIR$">
|
| 5 |
+
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
| 6 |
+
</content>
|
| 7 |
+
<orderEntry type="jdk" jdkName="pytorch" jdkType="Python SDK" />
|
| 8 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
| 9 |
+
</component>
|
| 10 |
+
</module>
|
.idea/misc.xml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="Black">
|
| 4 |
+
<option name="sdkName" value="pytorch" />
|
| 5 |
+
</component>
|
| 6 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="pytorch" project-jdk-type="Python SDK" />
|
| 7 |
+
</project>
|
.idea/modules.xml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="ProjectModuleManager">
|
| 4 |
+
<modules>
|
| 5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/llm_learn.iml" filepath="$PROJECT_DIR$/.idea/llm_learn.iml" />
|
| 6 |
+
</modules>
|
| 7 |
+
</component>
|
| 8 |
+
</project>
|
.idea/vcs.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="VcsDirectoryMappings">
|
| 4 |
+
<mapping directory="$PROJECT_DIR$/llm_learn" vcs="Git" />
|
| 5 |
+
</component>
|
| 6 |
+
</project>
|
demo.ipynb
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"id": "initial_id",
|
| 6 |
+
"metadata": {
|
| 7 |
+
"collapsed": true,
|
| 8 |
+
"jupyter": {
|
| 9 |
+
"is_executing": true
|
| 10 |
+
}
|
| 11 |
+
},
|
| 12 |
+
"source": [
|
| 13 |
+
"#第2章/加载编码工具\n",
|
| 14 |
+
"from transformers import BertTokenizer\n",
|
| 15 |
+
"tokenizer = BertTokenizer.from_pretrained(\n",
|
| 16 |
+
"pretrained_model_name_or_path='bert-base-chinese',\n",
|
| 17 |
+
"cache_dir=None,\n",
|
| 18 |
+
"force_download=False,\n",
|
| 19 |
+
")\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"#第2章/准备实验数据\n",
|
| 22 |
+
"sents = [\n",
|
| 23 |
+
"'你站在桥上看风景',\n",
|
| 24 |
+
"'看风景的人在楼上看你',\n",
|
| 25 |
+
"'明月装饰了你的窗子',\n",
|
| 26 |
+
"'你装饰了别人的梦',\n",
|
| 27 |
+
"]\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"#第2章/基本的编码函数\n",
|
| 30 |
+
"out = tokenizer.encode(\n",
|
| 31 |
+
"text=sents[0],\n",
|
| 32 |
+
"text_pair=sents[1],\n",
|
| 33 |
+
"#当句子长度大于max_length时截断\n",
|
| 34 |
+
"truncation=True,\n",
|
| 35 |
+
"#一律补PAD,直到max_length长度\n",
|
| 36 |
+
"padding='max_length',\n",
|
| 37 |
+
"add_special_tokens=True,\n",
|
| 38 |
+
"max_length=25,\n",
|
| 39 |
+
"return_tensors=None,\n",
|
| 40 |
+
")\n",
|
| 41 |
+
"print(out)\n",
|
| 42 |
+
"print(tokenizer.decode(out))"
|
| 43 |
+
],
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"execution_count": null
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"metadata": {},
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"execution_count": null,
|
| 52 |
+
"source": "",
|
| 53 |
+
"id": "1c21ec1248b8a72a"
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"metadata": {
|
| 57 |
+
"kernelspec": {
|
| 58 |
+
"display_name": "Python 3",
|
| 59 |
+
"language": "python",
|
| 60 |
+
"name": "python3"
|
| 61 |
+
},
|
| 62 |
+
"language_info": {
|
| 63 |
+
"codemirror_mode": {
|
| 64 |
+
"name": "ipython",
|
| 65 |
+
"version": 2
|
| 66 |
+
},
|
| 67 |
+
"file_extension": ".py",
|
| 68 |
+
"mimetype": "text/x-python",
|
| 69 |
+
"name": "python",
|
| 70 |
+
"nbconvert_exporter": "python",
|
| 71 |
+
"pygments_lexer": "ipython2",
|
| 72 |
+
"version": "2.7.6"
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
"nbformat": 4,
|
| 76 |
+
"nbformat_minor": 5
|
| 77 |
+
}
|
demo.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#第3章/加载数据集
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
dataset = load_dataset(path='seamew/ChnSentiCorp')
|
| 4 |
+
dataset
|