Resume-Ranker-LLM / preprocessing.py
Sudhir878786
completed
9e80f82
import re
import os
import unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
# download_path = os.path.join(os.getcwd(), 'nltk_packages')
# nltk.data.path.append(download_path)
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
def remove_non_ascii(words):
"""Remove non-ASCII characters from list of tokenized words"""
new_words = []
for word in words:
new_word = unicodedata.normalize('NFKD', word).encode(
'ascii', 'ignore').decode('utf-8', 'ignore')
new_words.append(new_word)
return new_words
def to_lowercase(words):
"""Convert all characters to lowercase from list of tokenized words"""
new_words = []
for word in words:
new_word = word.lower()
new_words.append(new_word)
return new_words
def remove_punctuation(words):
"""Remove punctuation from list of tokenized words"""
new_words = []
for word in words:
new_word = re.sub(r'[^\w\s]', '', word)
if new_word != '':
new_words.append(new_word)
return new_words
def replace_numbers(words):
"""Replace all interger occurrences in list of tokenized words with textual representation"""
p = inflect.engine()
new_words = []
for word in words:
if word.isdigit():
new_word = p.number_to_words(word)
new_words.append(new_word)
else:
new_words.append(word)
return new_words
def remove_stopwords(words):
"""Remove stop words from list of tokenized words"""
new_words = []
for word in words:
# print(word)
if word not in stopwords.words('english'):
new_words.append(word)
return new_words
def stem_words(words):
"""Stem words in list of tokenized words"""
stemmer = LancasterStemmer()
stems = []
for word in words:
stem = stemmer.stem(word)
stems.append(stem)
return stems
def lemmatize_verbs(words):
"""Lemmatize verbs in list of tokenized words"""
lemmatizer = WordNetLemmatizer()
lemmas = []
for word in words:
lemma = lemmatizer.lemmatize(word, pos='v')
lemmas.append(lemma)
return lemmas
def normalize(words):
words = remove_non_ascii(words)
words = to_lowercase(words)
words = remove_punctuation(words)
# words = replace_numbers(words)
words = remove_stopwords(words)
# words = stem_words(words)
# words = lemmatize_verbs(words)
return words
def preprocess(documents):
preprocessed_documents = []
for document in documents:
tokens = nltk.word_tokenize(document)
preprocessed = normalize(tokens)
preprocessed = ' '.join(map(str, preprocessed))
preprocessed_documents.append(preprocessed)
return preprocessed_documents