AdsClassifier / src /data /preprocessing_utils.py
SoooSlooow's picture
upload src
d1ef404
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import pymorphy2
class DataPreprocessor:
    """Tokenize, stop-word-filter and lemmatize a collection of texts.

    Attributes set by ``__init__``:
        morph: ``pymorphy2.MorphAnalyzer`` used to obtain normal forms.
        tokenizer: ``WordPunctTokenizer`` used to split lower-cased text.
        punctuation: set of the single characters in ``string.punctuation``.
        stopwords_russian: NLTK's "russian" stop-word list.
        stop_tokens: tokens dropped before lemmatization -- the Russian
            stop words except 'и', 'или' and 'не', plus ``punctuation``.
    """

    def __init__(self):
        """Load the stop-word corpus and build the analyzer and tokenizer.

        The NLTK 'stopwords' corpus is downloaded only when it is not
        already available locally, so constructing an instance does not
        touch the network once the corpus is installed.
        """
        try:
            self.stopwords_russian = stopwords.words("russian")
        except LookupError:
            # Corpus is missing locally: fetch it once, then load it.
            nltk.download('stopwords')
            self.stopwords_russian = stopwords.words("russian")
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        # 'и', 'или', 'не' are deliberately kept out of the stop set so
        # they survive filtering.
        # NOTE(review): only single punctuation characters are in this set;
        # if the tokenizer emits multi-character punctuation runs they are
        # not filtered -- confirm this is intended before changing it, as
        # downstream consumers may depend on the current token stream.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)

    def tokenize_data(self, texts):
        """Return one token list per item of ``texts``.

        Each item is converted with ``str()`` and lower-cased before being
        passed to ``self.tokenizer.tokenize``.
        """
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def lemmatize_tokens_string(self, tokens_string):
        """Return the normal forms of the tokens that are not stop tokens.

        ``tokens_string`` is an iterable of tokens (one tokenized text).
        Tokens found in ``self.stop_tokens`` are dropped; every other token
        is replaced by the ``normal_form`` of the first parse returned by
        ``self.morph.parse``. Order is preserved and a new list is returned.
        """
        return [
            self.morph.parse(token)[0].normal_form
            for token in tokens_string
            if token not in self.stop_tokens
        ]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token list of ``tokens`` in place.

        ``tokens`` is a mutable sequence of token lists; each element is
        replaced by its filtered, lemmatized version. Returns ``None``.
        """
        for index, token_list in enumerate(tokens):
            tokens[index] = self.lemmatize_tokens_string(token_list)

    def preprocess_texts(self, texts):
        """Tokenize and lemmatize ``texts``; return a list of token lists."""
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens