Spaces:
Runtime error
Runtime error
import string | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import WordPunctTokenizer | |
import pymorphy2 | |
class DataPreprocessor:
    """Tokenize and lemmatize texts, dropping Russian stop words and punctuation.

    Attributes:
        morph: ``pymorphy2.MorphAnalyzer`` used to look up normal forms.
        tokenizer: NLTK ``WordPunctTokenizer`` used to split texts.
        punctuation: Set of the single characters in ``string.punctuation``.
        stopwords_russian: NLTK's Russian stop-word list.
        stop_tokens: Tokens that are removed during lemmatization.
    """

    def __init__(self):
        """Build the analyzer, tokenizer and the set of tokens to drop.

        The NLTK ``stopwords`` corpus is downloaded only when it is not
        already available locally, so constructing an instance does not
        hit the network on every call.

        Raises:
            LookupError: If the corpus is still unavailable after the
                download attempt.
        """
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        try:
            self.stopwords_russian = stopwords.words("russian")
        except LookupError:
            # Corpus is missing locally: fetch it once and retry.
            nltk.download('stopwords')
            self.stopwords_russian = stopwords.words("russian")
        # 'и', 'или', 'не' are deliberately kept in the text (presumably
        # because they matter to the downstream task -- confirm with owner).
        # NOTE(review): only single punctuation characters are listed here;
        # WordPunctTokenizer can emit multi-character runs such as '...',
        # which are therefore not filtered -- confirm whether that is intended.
        self.stop_tokens = (
            set(self.stopwords_russian) - {'и', 'или', 'не'}
        ).union(self.punctuation)

    def tokenize_data(self, texts):
        """Lower-case and tokenize every item of ``texts``.

        Args:
            texts: Iterable of items; each one is converted with ``str()``
                before being lower-cased and tokenized.

        Returns:
            A list with one token list per input item, in input order.
        """
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def lemmatize_tokens_string(self, tokens_string):
        """Return the normal forms of the tokens that are not stop tokens.

        Args:
            tokens_string: Iterable of tokens belonging to one text.

        Returns:
            A new list holding, for every token not in ``self.stop_tokens``,
            the ``normal_form`` of its first ``morph.parse`` result.
        """
        return [
            self.morph.parse(token)[0].normal_form
            for token in tokens_string
            if token not in self.stop_tokens
        ]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token list of ``tokens`` in place.

        Args:
            tokens: Mutable sequence of token lists; each element is
                replaced by its lemmatized, stop-token-free version.

        Returns:
            None. The argument itself is modified.
        """
        for index, token_list in enumerate(tokens):
            tokens[index] = self.lemmatize_tokens_string(token_list)

    def preprocess_texts(self, texts):
        """Tokenize ``texts`` and lemmatize the resulting tokens.

        Args:
            texts: Iterable of items to preprocess (see ``tokenize_data``).

        Returns:
            A list with one list of lemmas per input item.
        """
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens