nepp1d0 committed
Commit a336299 · Parent: bdabb1f

add tokenizer

Files changed (3)
  1. tokenizer.json +1 -1
  2. tokenizer_config.json +1 -1
  3. vocab.txt +57 -0
tokenizer.json CHANGED
@@ -1 +1 @@
- {"version":"1.0","truncation":{"max_length":512,"strategy":"LongestFirst","stride":0},"padding":{"strategy":"BatchLongest","direction":"Right","pad_to_multiple_of":null,"pad_id":0,"pad_type_id":0,"pad_token":"[PAD]"},"added_tokens":[{"id":0,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"WhitespaceSplit"},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}}],"pair":[{"Sequence":{"id":"A","type_id":0}},{"Sequence":{"id":"B","type_id":1}}],"special_tokens":{"[CLS]":{"id":"[CLS]","ids":[2],"tokens":["[CLS]"]},"[SEP]":{"id":"[SEP]","ids":[3],"tokens":["[SEP]"]}}},"decoder":{"type":"WordPiece","prefix":"##","cleanup":true},"model":{"type":"WordPiece","unk_token":"[UNK]","continuing_subword_prefix":"##","max_input_chars_per_word":100,"vocab":{"[UNK]":0,"[PAD]":1,"[CLS]":2,"[SEP]":3,"[MASK]":4,"\"":5,"A":6,"B":7,"C":8,"D":9,"E":10,"F":11,"G":12,"H":13,"I":14,"J":15,"K":16,"L":17,"M":18,"N":19,"O":20,"P":21,"Q":22,"R":23,"S":24,"T":25,"V":26,"W":27,"X":28,"Y":29,"Z":30,"##C":31,"##V":32,"##G":33,"##H":34,"##N":35,"##R":36,"##S":37,"##Y":38,"##Q":39,"##W":40,"##D":41,"##K":42,"##F":43,"##T":44,"##A":45,"##P":46,"##E":47,"##L":48,"##I":49,"##M":50,"##X":51,"##\"":52,"##B":53,"##J":54,"##Z":55,"##O":56}}}
+ {"version":"1.0","truncation":{"max_length":512,"strategy":"LongestFirst","stride":0},"padding":{"strategy":"BatchLongest","direction":"Right","pad_to_multiple_of":null,"pad_id":0,"pad_type_id":0,"pad_token":"[PAD]"},"added_tokens":[{"id":0,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":{"type":"BertNormalizer","clean_text":true,"handle_chinese_chars":true,"strip_accents":null,"lowercase":true},"pre_tokenizer":{"type":"WhitespaceSplit"},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}}],"pair":[{"Sequence":{"id":"A","type_id":0}},{"Sequence":{"id":"B","type_id":1}}],"special_tokens":{"[CLS]":{"id":"[CLS]","ids":[2],"tokens":["[CLS]"]},"[SEP]":{"id":"[SEP]","ids":[3],"tokens":["[SEP]"]}}},"decoder":{"type":"WordPiece","prefix":"##","cleanup":true},"model":{"type":"WordPiece","unk_token":"[UNK]","continuing_subword_prefix":"##","max_input_chars_per_word":100,"vocab":{"[UNK]":0,"[PAD]":1,"[CLS]":2,"[SEP]":3,"[MASK]":4,"\"":5,"A":6,"B":7,"C":8,"D":9,"E":10,"F":11,"G":12,"H":13,"I":14,"J":15,"K":16,"L":17,"M":18,"N":19,"O":20,"P":21,"Q":22,"R":23,"S":24,"T":25,"V":26,"W":27,"X":28,"Y":29,"Z":30,"##S":31,"##K":32,"##P":33,"##T":34,"##L":35,"##Y":36,"##M":37,"##W":38,"##A":39,"##E":40,"##R":41,"##I":42,"##N":43,"##V":44,"##F":45,"##G":46,"##D":47,"##Q":48,"##H":49,"##C":50,"##X":51,"##B":52,"##\"":53,"##J":54,"##Z":55,"##O":56}}}
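The substantive change in tokenizer.json is the normalizer: previously null, it is now a BertNormalizer with clean_text, handle_chinese_chars, and lowercase all enabled, running before the WhitespaceSplit pre-tokenizer and the WordPiece model. The IDs of the ##-prefixed continuing-subword tokens are also reordered. A minimal sketch of loading and inspecting the updated file with the tokenizers library, assuming it sits in the working directory; the input string is an arbitrary example, not from the commit:

    # Load the updated tokenizer.json and encode an arbitrary string.
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")

    # The new BertNormalizer lowercases input before WhitespaceSplit and
    # WordPiece run; since the vocabulary holds only uppercase symbols,
    # this changes which tokens can match.
    enc = tok.encode("MKTA YIAK")  # arbitrary example sequence
    print(enc.tokens)  # tokens after the [CLS]/[SEP] template is applied
    print(enc.ids)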
tokenizer_config.json CHANGED
@@ -1 +1 @@
- {"model_max_length": 512, "unk_token": "[UNK]", "pad_token": "[PAD]", "cls_token": "[CLS]", "sep_token": "[SEP]", "mask_token": "[MASK]", "tokenizer_class": "PreTrainedTokenizerFast"}
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "tokenizer_class": "DistilBertTokenizer"}
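The new tokenizer_config.json switches tokenizer_class from the generic PreTrainedTokenizerFast to DistilBertTokenizer, adds do_lower_case, tokenize_chinese_chars, and strip_accents (mirroring the BertNormalizer settings above), and drops model_max_length (the 512-token truncation is still encoded in tokenizer.json). AutoTokenizer reads tokenizer_class from this file, so loading behavior changes accordingly; a minimal sketch, where "username/repo" is a placeholder and not taken from the commit:

    # AutoTokenizer instantiates the class named in tokenizer_config.json.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("username/repo")  # placeholder id
    # With tokenizer_class = "DistilBertTokenizer", a DistilBert tokenizer
    # (the fast variant when tokenizer.json is present) is returned instead
    # of the generic PreTrainedTokenizerFast from the old config.
    print(type(tokenizer).__name__)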
vocab.txt ADDED
@@ -0,0 +1,57 @@
+ [UNK]
+ [PAD]
+ [CLS]
+ [SEP]
+ [MASK]
+ "
+ A
+ B
+ C
+ D
+ E
+ F
+ G
+ H
+ I
+ J
+ K
+ L
+ M
+ N
+ O
+ P
+ Q
+ R
+ S
+ T
+ V
+ W
+ X
+ Y
+ Z
+ ##S
+ ##K
+ ##P
+ ##T
+ ##L
+ ##Y
+ ##M
+ ##W
+ ##A
+ ##E
+ ##R
+ ##I
+ ##N
+ ##V
+ ##F
+ ##G
+ ##D
+ ##Q
+ ##H
+ ##C
+ ##X
+ ##B
+ ##"
+ ##J
+ ##Z
+ ##O
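vocab.txt is what the slow DistilBertTokenizer reads: one token per line, with line order defining token IDs, so these 57 lines mirror the vocab map inside tokenizer.json ([UNK] = 0 through ##O = 56). A minimal sketch of constructing the slow tokenizer directly from this file, with keyword arguments taken from the new tokenizer_config.json:

    # Build the slow tokenizer straight from vocab.txt.
    from transformers import DistilBertTokenizer

    tok = DistilBertTokenizer(
        vocab_file="vocab.txt",
        do_lower_case=True,
        tokenize_chinese_chars=True,
        strip_accents=None,
    )
    # IDs follow the line order of vocab.txt.
    print(tok.convert_tokens_to_ids(["[CLS]", "A", "##S"]))  # -> [2, 6, 31]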