Sudhir878786 committed
Commit • 9e80f82
1 parent: fdb6adb

completed

Files changed:
- .gitignore +5 -0
- README.md +26 -6
- app.py +75 -0
- core.py +37 -0
- demo.py +26 -0
- embedding.py +24 -0
- gradio_app.py +24 -0
- install.sh +12 -0
- main.py +6 -0
- pdf_loader.py +64 -0
- preprocessing.py +107 -0
- requirements.txt +11 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
__pycache__
venv
nltk_packages
embedding
documents
README.md
CHANGED
@@ -1,13 +1,33 @@
 ---
-title: Resume
-emoji:
-colorFrom:
-colorTo:
+title: Resume Ranking
+emoji: π
+colorFrom: yellow
+colorTo: indigo
 sdk: streamlit
 sdk_version: 1.21.0
 app_file: app.py
 pinned: false
-license: mit
 ---
 
-
+# resume-ranker
+<hr>
+
+## How to Use?
+
+Install all the dependencies with:
+
+```bash
+./install.sh
+```
+
+Run the Streamlit app with:
+
+```bash
+streamlit run app.py
+```
+
+Or run the demo directly with:
+
+```bash
+python demo.py
+```
app.py
ADDED
@@ -0,0 +1,75 @@
import streamlit as st
from pdf_loader import load_btyes_io
from core import pipeline

# Developer Details
developer_details = {
    "Sudhir Sharma": {
        "Education": "B.Tech in Computer Science and Engineering, IIT Bhilai, 2024",
        "Email": "[email protected]",
        "GitHub": "[GitHub Profile](https://github.com/Sudhir878786)",
        "LinkedIn": "[LinkedIn Profile](https://www.linkedin.com/in/sudhirsharma87/)"
    }
}


def inference(query, files, embedding_type):

    # pdfReader = PyPDF2.PdfReader(files[0])
    # text = ''
    # for page in pdfReader.pages:
    #     text += page.extract_text()
    # st.write(text)

    results, _ = pipeline(query, load_btyes_io(files), embedding_type=embedding_type)
    prob_per_documents = {result['name']: result['similarity'] for result in results}
    return prob_per_documents

st.sidebar.header("Developer Details")
selected_developer = st.sidebar.selectbox("Select a developer", list(developer_details.keys()))
st.sidebar.markdown(developer_details[selected_developer]["Education"])
st.sidebar.markdown(developer_details[selected_developer]["Email"])
st.sidebar.markdown(developer_details[selected_developer]["GitHub"])
st.sidebar.markdown(developer_details[selected_developer]["LinkedIn"])

sample_files = [
    "documents/business.pdf",
    "documents/data_science.pdf",
]

sample_job_descriptions = {
    "Software Engineer": """We are looking for a software engineer with experience in Python and web development. The ideal candidate should have a strong background in building scalable and robust applications. Knowledge of frameworks such as Flask and Django is a plus. Experience with front-end technologies like HTML, CSS, and JavaScript is desirable. The candidate should also have a good understanding of databases and SQL. Strong problem-solving and communication skills are required for this role.
""",
    "Data Scientist": """We are seeking a data scientist with expertise in machine learning and statistical analysis. The candidate should have a solid understanding of data manipulation, feature engineering, and model development. Proficiency in Python and popular data science libraries such as NumPy, Pandas, and Scikit-learn is required. Experience with deep learning frameworks like TensorFlow or PyTorch is a plus. Strong analytical and problem-solving skills are essential for this position.
"""
}

st.sidebar.header("Sample Files")
for sample_file in sample_files:
    st.sidebar.markdown(f"[{sample_file}](./sample_files/{sample_file})")

st.sidebar.header("Sample Job Descriptions")
selected_job = st.sidebar.selectbox("Select a job description", list(sample_job_descriptions.keys()))
st.sidebar.markdown("```")
st.sidebar.code(sample_job_descriptions[selected_job])
st.title("👨🏼‍💻 Resume Ranker")

query = st.text_area("Job Description", height=200, value=sample_job_descriptions[selected_job])
uploaded_files = st.file_uploader("Upload Resume", accept_multiple_files=True, type=["txt", "pdf"])
embedding_type = st.selectbox("Embedding Type", ["bert", "minilm", "tfidf"])

if st.button("Submit"):
    if not query:
        st.warning("Please enter a job description.")
    elif not uploaded_files:
        st.warning("Please upload one or more resumes.")
    else:
        with st.spinner("Processing..."):
            results = inference(query, uploaded_files, embedding_type)
        st.subheader("Results")
        for document, similarity in results.items():
            # round similarity to 2 decimal places
            if similarity >= 1:
                similarity = round(similarity, 2)
            st.write(f"- {document}:")
            st.progress(similarity, text=f"{similarity:.2%}")
core.py
ADDED
@@ -0,0 +1,37 @@
from embedding import embedding
from preprocessing import preprocess
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import streamlit as st

def pipeline(input_doc: str, ori_documents, embedding_type='bert'):
    documents = np.array([doc['content'] for doc in ori_documents])
    documents = np.insert(documents, 0, input_doc)
    # st.write(documents)
    preprocessed_documents = preprocess(documents)
    # st.write(preprocessed_documents)
    print(f"Encoding with {embedding_type}...")
    documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
    print("Encoding finished")

    # compute cosine similarity between all documents
    pairwise = cosine_similarity(documents_vectors)

    # keep only the similarities between the query (row 0) and each resume
    pairwise = pairwise[0, 1:]
    sorted_idx = np.argsort(pairwise)[::-1]
    result_pairwise = pairwise[sorted_idx]

    results = []
    print('Resume ranking:')
    for idx in sorted_idx:
        single_result = {
            'rank': idx,
            'name': ori_documents[idx]['name'],
            'similarity': pairwise[idx].item()
        }
        results.append(single_result)
        print(f'Resume of candidate {idx}')
        print(f'Cosine Similarity: {pairwise[idx]}\n')

    return results, result_pairwise
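The ranking above hinges on the job description being inserted at index 0 of `documents`: `cosine_similarity` returns an (n+1) x (n+1) matrix, and `pairwise[0, 1:]` keeps only the similarity of the query against each resume before sorting in descending order. A minimal, self-contained sketch of that slicing (the vectors below are made up purely for illustration):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Row 0 stands in for the job description, rows 1..n for the resume embeddings.
vectors = np.array([
    [1.0, 0.0, 1.0],   # query
    [0.9, 0.1, 0.8],   # resume 0
    [0.0, 1.0, 0.1],   # resume 1
])
pairwise = cosine_similarity(vectors)   # shape (3, 3)
scores = pairwise[0, 1:]                # query vs. each resume
order = np.argsort(scores)[::-1]        # most similar resume first
print(order, scores[order])
```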
demo.py
ADDED
@@ -0,0 +1,26 @@
from pdf_loader import load_documents
from core import pipeline

if __name__ == '__main__':
    pipeline('''About Sleek

Sleek is on a mission to revolutionize how entrepreneurs operate their business. We want to give small business owners peace of mind and the power of online solutions to allow them to focus on what they do best - growing their business. As we work for our thousands of customers, we gather millions of data points about their business, and in turn we transform those into useful, actionable insights and recommendations to accelerate their growth through smart algorithms.

We are a team of 400 builders from 17 countries, with offices in Singapore, Philippines, Hong Kong, Australia and the UK committed to delivering a delightful experience to our clients!

You will be working in the Data & Analytics organization to solve a wide range of business problems leveraging advanced analytics. You will deploy a flexible analytical skill set to deliver insightful data and analysis and model business scenarios. Your principal goal will be to use data to drive better business decisions. This means translating data into meaningful insights and recommendations and, where relevant, proactively implement improvements. You will be developing the business reporting and analysis for our internal operations world-wide. The job will require working closely with the various Business Units to understand their business question as well as the whole data team to understand and access available data.

Position Duties
Drive analytical problem-solving and deep dives. Work with large, complex data sets. Solve difficult, non-routine problems, applying advanced quantitative methods.
Collaborate with a wide variety of cross-functional partners to determine business needs, drive analytical projects from start to finish.
Align with involved stakeholders to set up dashboards and reports to drive data driven decision across all departments
Working very closely with our Data team, Tech and Product team to understand the business logic to generate accurate reports and correct analysis

Requirements

Performance Standards
Able to commit for a period of at least 4 months
Currently pursuing a degree in Business Science, Engineering or relevant disciplines with a focus on data.
Good knowledge in SQL, R and Python.
Experience in data visualization tools (Tableau, PowerBI, Google DataStudio or equivalent) will be an added advantage.''',
    load_documents(source_dir='documents'))
embedding.py
ADDED
@@ -0,0 +1,24 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import os

def embedding(documents, embedding='bert'):
    if embedding == 'bert':
        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens', cache_folder=os.path.join(os.getcwd(), 'embedding'))

        document_embeddings = sbert_model.encode(documents)
        return document_embeddings

    if embedding == 'minilm':
        sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=os.path.join(os.getcwd(), 'embedding'))

        document_embeddings = sbert_model.encode(documents)
        return document_embeddings

    if embedding == 'tfidf':
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True, stop_words='english')
        word_vectorizer.fit(documents)
        word_features = word_vectorizer.transform(documents)

        return word_features
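In terms of return types, the two SentenceTransformer branches return a dense NumPy array of sentence embeddings, while the TF-IDF branch returns a sparse document-term matrix; both are accepted by `cosine_similarity` in `core.py`. A small usage sketch with arbitrary example strings (the 'bert'/'minilm' branches download the model into the local `embedding/` cache folder on first use):

```python
from embedding import embedding

docs = [
    "python developer with flask and sql experience",
    "data scientist skilled in pytorch and statistics",
]

dense = embedding(docs, embedding='minilm')   # NumPy array of sentence embeddings
sparse = embedding(docs, embedding='tfidf')   # sparse TF-IDF document-term matrix
print(dense.shape, sparse.shape)
```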
gradio_app.py
ADDED
@@ -0,0 +1,24 @@
from pdf_loader import load_documents
from core import pipeline

import gradio as gr

def inference(query, files):
    # get the paths of the uploaded files
    files = [file.name for file in files]
    results, _ = pipeline(query, load_documents(file_paths=files))

    prob_per_documents = {result['name']: result['similarity'] for result in results}
    return prob_per_documents

with gr.Blocks() as demo:
    # write a header

    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
    files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
    btn = gr.Button("Submit")
    output = gr.Label(label="Results")
    # output = gr.Number(label="Results")
    btn.click(inference, inputs=[job_desc, files], outputs=output)

demo.launch()
install.sh
ADDED
@@ -0,0 +1,12 @@
#!/bin/bash
find . \( -name __pycache__ -o -name "*.pyc" \) -delete
python3 -m venv venv
# Check the operating system
if [[ "$OSTYPE" == "msys" ]]; then
    # Windows
    source venv/Scripts/activate
else
    # Unix-like systems (macOS, Linux)
    source venv/bin/activate
fi
pip install --no-cache-dir -r requirements.txt
main.py
ADDED
@@ -0,0 +1,6 @@
# from fastapi import FastAPI, File, UploadFile
# app = FastAPI()

# @app.post("/resume")
# async def root(name:str, email:str, about:str, file:UploadFile = File(...)):
#     return {"name":name, "email":email, "about":about, "file_name":file.filename}
pdf_loader.py
ADDED
@@ -0,0 +1,64 @@
import os
import PyPDF2

def load_single_document(file_path: str):
    # Loads a single document from a file path
    if file_path[-4:] == '.txt':
        with open(file_path, 'r') as f:
            return f.read()

    elif file_path[-4:] == '.pdf':
        pdfFileObj = open(file_path, 'rb')
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        text = ''
        for page in pdfReader.pages:
            text += page.extract_text()
        return text

    elif file_path[-4:] == '.csv':
        with open(file_path, 'r') as f:
            return f.read()

    else:
        raise Exception('Invalid file type')


def load_documents(file_paths: list[str] = None, source_dir: str = None):
    # Loads all documents from the source documents directory
    if file_paths:
        all_files = file_paths
    elif source_dir:
        all_files = [os.path.abspath(os.path.join(source_dir, file)) for file in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, file))]
    else:
        raise Exception('No file paths or source directory provided')

    return [
        {
            'name': os.path.basename(file_path),
            'content': load_single_document(f"{file_path}")
        } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
    ]

def load_io(file_byte = None):
    # Loads a single document from a file-like object
    if file_byte.name[-3:] == 'txt':
        return file_byte.read().decode("utf-8")

    elif file_byte.name[-3:] == 'pdf':
        pdfReader = PyPDF2.PdfReader(file_byte)
        text = ''
        for page in pdfReader.pages:
            text += page.extract_text()
        return text

    else:
        raise Exception('Invalid file type')

def load_btyes_io(files = None):

    return [
        {
            'name': file_btye.name,
            'content': load_io(file_btye)
        } for idx, file_btye in enumerate(files) if file_btye.name[-3:] in ['txt', 'pdf']
    ]
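Both `load_documents` and `load_btyes_io` normalize their input into the same list-of-dicts shape (`name` plus extracted `content`) that `core.pipeline` consumes. A short sketch, assuming a local `documents/` directory containing `.pdf` or `.txt` resumes (the same directory name used by `demo.py`):

```python
from pdf_loader import load_documents

docs = load_documents(source_dir='documents')
# e.g. [{'name': 'business.pdf', 'content': 'extracted resume text ...'}, ...]
for doc in docs:
    print(doc['name'], '->', len(doc['content']), 'characters')
```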
preprocessing.py
ADDED
@@ -0,0 +1,107 @@
import re
import os
import unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# download_path = os.path.join(os.getcwd(), 'nltk_packages')
# nltk.data.path.append(download_path)
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        # print(word)
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    # words = stem_words(words)
    # words = lemmatize_verbs(words)
    return words


def preprocess(documents):
    preprocessed_documents = []
    for document in documents:
        tokens = nltk.word_tokenize(document)
        preprocessed = normalize(tokens)
        preprocessed = ' '.join(map(str, preprocessed))
        preprocessed_documents.append(preprocessed)

    return preprocessed_documents
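To make the effect of `preprocess` concrete: each document is tokenized with NLTK, stripped of non-ASCII characters, lowercased, stripped of punctuation and English stop words, then re-joined into one string (stemming, lemmatization and number replacement stay commented out). A quick sketch with an arbitrary example sentence; the exact tokens depend on the NLTK tokenizer:

```python
from preprocessing import preprocess

docs = ["Senior Python Developer with 5 years of experience in Django and SQL."]
print(preprocess(docs))
# roughly: ['senior python developer 5 years experience django sql']
```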
requirements.txt
ADDED
@@ -0,0 +1,11 @@
inflect==6.0.4
nltk==3.8.1
numpy==1.24.3
PyPDF2==3.0.1
scikit_learn==1.2.2
sentence_transformers==2.2.2
fastapi
uvicorn[standard]
python-multipart
python-dotenv
streamlit