add tokenizer
- special_tokens_map.json +1 -1
- tokenizer.json +13 -13
- tokenizer_config.json +1 -1
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{
+{}
tokenizer.json
CHANGED
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length":
+    "max_length": 512,
     "strategy": "LongestFirst",
     "stride": 0
   },
@@ -66,7 +66,7 @@
     "clean_text": true,
     "handle_chinese_chars": true,
     "strip_accents": null,
-    "lowercase":
+    "lowercase": false
   },
   "pre_tokenizer": {
     "type": "WhitespaceSplit"
@@ -183,17 +183,17 @@
     "r": 56,
     "s": 57,
     "##\"": 58,
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
-    "##
+    "##S": 59,
+    "##E": 60,
+    "##P": 61,
+    "##]": 62,
+    "##C": 63,
+    "##O": 64,
+    "##F": 65,
+    "##[": 66,
+    "##N": 67,
+    "##B": 68,
+    "##c": 69
     }
   }
 }
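As a quick sanity check (not part of this commit), the updated tokenizer.json can be read back with the tokenizers library to confirm the new truncation limit and the added vocabulary entries. A minimal sketch, assuming the file sits in the working directory:

from tokenizers import Tokenizer

# Load the serialized tokenizer written by this commit (local path is an assumption).
tok = Tokenizer.from_file("tokenizer.json")

# The truncation block now carries a concrete limit: max_length=512, LongestFirst, stride 0.
# (The read-only `truncation` property is available in recent tokenizers releases.)
print(tok.truncation)

# The newly added continuation pieces map to ids 59-69.
for piece in ["##S", "##E", "##P", "##c"]:
    print(piece, tok.token_to_id(piece))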
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"
+{"tokenizer_class": "PreTrainedTokenizerFast"}
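With tokenizer_config.json now naming PreTrainedTokenizerFast, all three files can be loaded together through transformers. A minimal sketch, assuming the files live in one local directory; the path and the sample input are placeholders, not part of the repository:

from transformers import AutoTokenizer

# Resolves the tokenizer class from tokenizer_config.json and loads tokenizer.json.
tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")

# Whitespace-split input, illustrative only; ids come from the vocab shown above.
enc = tokenizer("C C O")
print(enc.input_ids)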