| import re |
| import string |
| import numpy as np |
| import torch |
| import unicodedata |
| import nltk |
|
|
| |
# Fetch the stopword corpora once at import time (no-op if already cached).
nltk.download('stopwords')
from nltk.corpus import stopwords

# BUG FIX: stopwords.words('russian', 'english') passed 'english' into the
# reader's second parameter (ignore_lines_startswith), so only the Russian
# list was actually loaded. Passing a list of fileids loads both languages.
stop_words = set(stopwords.words(['russian', 'english']))
|
|
def data_preprocessing(text: str) -> str:
    """Normalize raw text for tokenization.

    Lowercases the text, replaces hyphens and newlines with spaces, strips
    HTML tags, keeps only letters, digits, whitespace and apostrophes, then
    drops stopwords and purely numeric tokens.

    Args:
        text: raw input string.

    Returns:
        Cleaned, single-space-separated string.
    """
    text = text.lower()
    text = text.replace('-', ' ').replace('\n', ' ')

    # Remove HTML tags before character-level filtering.
    text = re.sub(r'<.*?>', '', text)
    # Keep letters (L*), numbers (N*), separators (Z*) and apostrophes;
    # everything else (punctuation, symbols, control chars) is dropped.
    text = ''.join(
        c for c in text
        if unicodedata.category(c).startswith(('L', 'N', 'Z')) or c == "'"
    )
    # Single pass over the tokens: the text is already lowercased, so the
    # per-word .lower() from the original is redundant, and the stopword
    # and digit filters can be applied together.
    kept = [w for w in text.split() if w not in stop_words and not w.isdigit()]
    return ' '.join(kept)
|
|
|
|
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep only the (word, count) pairs whose count is strictly above ``n``.

    Args:
        sorted_words: list of (word, count) tuples.
        n: frequency threshold; pairs with count <= n are discarded.

    Returns:
        List of the surviving (word, count) pairs, order preserved.
    """
    return [pair for pair in sorted_words if pair[1] > n]
|
|
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) each encoded review to a fixed length.

    Reviews shorter than ``seq_len`` are zero-padded on the LEFT; longer
    reviews keep only their first ``seq_len`` tokens.

    Args:
        review_int: list of reviews, each a list of integer token ids.
        seq_len: target sequence length.

    Returns:
        Integer array of shape (len(review_int), seq_len).
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) >= seq_len:
            features[i] = review[:seq_len]
        else:
            # Left-pad: write the review into the tail of the (zeroed) row.
            # An empty review leaves the row all zeros (empty slice assign).
            features[i, seq_len - len(review):] = review
    return features
|
|
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False,
) -> torch.Tensor:
    """Clean a raw string and encode it as a fixed-length tensor of ids.

    Runs ``data_preprocessing``, maps each surviving word to its integer id
    via ``vocab_to_int`` (out-of-vocabulary words are dropped), then
    left-pads / truncates to ``seq_len``.

    Args:
        input_string: raw text to encode.
        seq_len: target sequence length.
        vocab_to_int: word -> integer id mapping.
        verbose: if True, print a message for each word missing from the
            vocabulary instead of dropping it silently.

    Returns:
        1-D torch tensor of length ``seq_len``.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # OOV words are skipped; optionally report them.
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)