# Train the DalaT5 SentencePiece tokeniser on the cleaned transliteration corpus.
import json

import sentencepiece as spm
from transformers import T5Tokenizer

# Stream the cleaned JSONL corpus and write both sides of every transliteration
# pair to a plain-text file that SentencePiece can consume, one sentence per line.
with open("src/data/tokeniser_corpus.txt", "w", encoding="utf-8") as f_out:
    with open("src/data/clean_corpus.jsonl", "r", encoding="utf-8") as f_in:
        for i, line in enumerate(f_in):
            if i >= 1000000:  # cap the tokeniser corpus at one million records
                break

            item = json.loads(line)
            src = item["transliteration"]["src"]
            tgt = item["transliteration"]["tgt"]

            f_out.write(src + "\n")
            f_out.write(tgt + "\n")

# Train a unigram SentencePiece model with T5-style special tokens.
# <pad>, <s> and </s> are also listed as user-defined symbols so that literal
# occurrences in the text are kept as single pieces on the reserved IDs.
spm.SentencePieceTrainer.Train(
    input="src/data/tokeniser_corpus.txt",
    model_prefix="src/tokeniser/dalat5_sp",
    vocab_size=40000,
    model_type="unigram",
    character_coverage=1.0,
    max_sentence_length=8384,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    user_defined_symbols=["<pad>", "<s>", "</s>"]
)

# Wrap the trained SentencePiece model in a Hugging Face T5Tokenizer and save it
# in the standard transformers layout.
tokenizer = T5Tokenizer.from_pretrained("src/tokeniser/dalat5_sp.model")
tokenizer.save_pretrained("src/tokeniser/")
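
# Optional sanity check (a sketch, not part of the original pipeline): reload the
# saved tokeniser and encode an arbitrary sample string to confirm the vocabulary
# and special tokens are picked up. The sample text is a placeholder assumption.
check = T5Tokenizer.from_pretrained("src/tokeniser/")
sample = "example sentence"  # any line from the corpus works here
print(check.tokenize(sample))
print("vocab size:", check.vocab_size)
print("pad/eos/unk ids:", check.pad_token_id, check.eos_token_id, check.unk_token_id)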