prasadbobby committed
Commit • fabc63c
1 Parent(s): 57d4231

Add application file
Files changed:
- Demo/Interface.png +0 -0
- Demo/Interface_Results.png +0 -0
- Demo/Workflow.png +0 -0
- Models.py +67 -0
- README.md +26 -12
- Resume_scanner.py +22 -0
- __pycache__/Models.cpython-37.pyc +0 -0
- __pycache__/Resume_Scanner.cpython-37.pyc +0 -0
- application.py +72 -0
- requirements.txt +7 -0
Demo/Interface.png
ADDED
Demo/Interface_Results.png
ADDED
Demo/Workflow.png
ADDED
Models.py
ADDED
@@ -0,0 +1,67 @@
import gensim
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import streamlit as st


# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


@st.cache_resource
def get_HF_embeddings(sentences):
    # Load model from the Hugging Face Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform pooling -- mean pooling in this case
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return embeddings


@st.cache_data
def get_doc2vec_embeddings(JD, text_resume):
    nltk.download("punkt")
    nltk.download("punkt_tab")  # needed by word_tokenize on newer NLTK releases
    data = [JD]
    resume_embeddings = []

    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

    model = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=3, epochs=80)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
    JD_embeddings = np.transpose(model.dv['0'].reshape(-1, 1))  # gensim 4.x API (formerly model.docvecs)

    for i in text_resume:
        text = word_tokenize(i.lower())
        embeddings = model.infer_vector(text)
        resume_embeddings.append(np.transpose(embeddings.reshape(-1, 1)))
    return (JD_embeddings, resume_embeddings)


def cosine(embeddings1, embeddings2):
    # Get the match percentage of each resume embedding against the JD embedding
    score_list = []
    for i in embeddings1:
        matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
        matchPercentage = np.round(matchPercentage, 4) * 100  # percentage with two decimals
        print("Your resume matches about " + str(matchPercentage[0][0]) + "% of the job description.")
        score_list.append(str(matchPercentage[0][0]))
    return score_list
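For orientation, a minimal usage sketch of the two building blocks above; the JD and resume strings are illustrative placeholders, not part of this commit:

```python
# Hedged sketch: compare one placeholder resume against a placeholder JD
from Models import get_HF_embeddings, cosine

jd_emb = get_HF_embeddings("Looking for a Python developer with NLP experience.")
resume_emb = get_HF_embeddings("Python developer; 3 years of NLP and PyTorch.")

scores = cosine([resume_emb], jd_emb)  # list of stringified match percentages
print(scores)
```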
README.md
CHANGED
@@ -1,12 +1,26 @@
# Resume Screening App

This app is built for employers screening candidates against a particular job description. Given a candidate's resume and a job description, it outputs a percentage similarity score between the two.

App deployed on [Streamlit Community Cloud](https://soumee2000-applicant-tracking-system-application-tqrpm0.streamlit.app/)

## Intuition:
1. Get [context-aware BERT embeddings](https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b) or [doc2vec document embeddings](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) for the resume and the job description.
2. The [Hugging Face](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens) library was very useful for this, along with doc2vec and nltk.
3. Compute their [cosine similarity](https://developers.google.com/machine-learning/clustering/similarity/measuring-similarity).

## Workflow:
<img src = "https://github.com/SOUMEE2000/Applicant_Tracking_System/blob/main/Demo/Workflow.png">

## Interface
<img src = "https://github.com/SOUMEE2000/Resume_Scanner/blob/main/Demo/Interface.png" height=400>
<img src = "https://github.com/SOUMEE2000/Applicant_Tracking_System/blob/main/Demo/Interface_Results.png" height=400 width=800>

## Usage

```
pip install -r requirements.txt
```

**Run**: ```streamlit run application.py```
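**CLI mode**: per the `sys.argv` branch in `application.py`, the comparison can presumably also be run headlessly (paths below are placeholders):

```
python application.py path/to/resume.pdf path/to/jd.txt
```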
Resume_scanner.py
ADDED
@@ -0,0 +1,22 @@
from Models import get_HF_embeddings, cosine, get_doc2vec_embeddings


def compare(resume_texts, JD_text, flag='HuggingFace-BERT'):
    JD_embeddings = None
    resume_embeddings = []

    if flag == 'HuggingFace-BERT':
        if JD_text is not None:
            JD_embeddings = get_HF_embeddings(JD_text)
        for resume_text in resume_texts:
            resume_embeddings.append(get_HF_embeddings(resume_text))

        if JD_embeddings is not None and resume_embeddings:
            cos_scores = cosine(resume_embeddings, JD_embeddings)
            return cos_scores

    else:
        # Doc2Vec branch (left as a TODO in the original): train on the JD and
        # infer a vector for each resume via Models.get_doc2vec_embeddings
        JD_embeddings, resume_embeddings = get_doc2vec_embeddings(JD_text, resume_texts)
        return cosine(resume_embeddings, JD_embeddings)
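A minimal sketch of how `compare` is expected to be called; the texts are placeholders (in the app they come from `pdfplumber` extraction and a text area):

```python
from Resume_scanner import compare

scores = compare(
    ["Python developer; 3 years of NLP and PyTorch."],      # extracted resume texts
    "Looking for a Python developer with NLP experience.",  # job description
    flag='HuggingFace-BERT',
)
print(scores)  # stringified match percentages, one per resume
```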
__pycache__/Models.cpython-37.pyc
ADDED
Binary file (2.52 kB)
__pycache__/Resume_Scanner.cpython-37.pyc
ADDED
Binary file (962 Bytes)
application.py
ADDED
@@ -0,0 +1,72 @@
import sys
import streamlit as st
import pdfplumber
from Resume_scanner import compare


def extract_pdf_data(file_path):
    data = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                data += text
    return data


def extract_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.read()
    return data


# Command-line argument processing
if len(sys.argv) > 1:
    if len(sys.argv) == 3:
        resume_path = sys.argv[1]
        jd_path = sys.argv[2]

        resume_data = extract_pdf_data(resume_path)
        jd_data = extract_text_data(jd_path)

        result = compare([resume_data], jd_data, flag='HuggingFace-BERT')
        print(result)  # echo the scores, since the UI is not rendered in CLI mode

    sys.exit()

# Sidebar
flag = 'HuggingFace-BERT'
with st.sidebar:
    st.markdown('**Which embedding do you want to use**')
    options = st.selectbox('Which embedding do you want to use',
                           ['HuggingFace-BERT', 'Doc2Vec'],
                           label_visibility="collapsed")
    flag = options

# Main content
tab1, tab2 = st.tabs(["**Home**", "**Results**"])

# Tab Home
with tab1:
    st.title("Applicant Tracking System")
    uploaded_files = st.file_uploader(
        '**Choose your resume.pdf file:** ', type="pdf", accept_multiple_files=True)
    JD = st.text_area("**Enter the job description:**")
    comp_pressed = st.button("Compare!")
    if comp_pressed and uploaded_files:
        # Streamlit file_uploader gives file-like objects (not paths),
        # which pdfplumber can open directly
        resume_texts = [extract_pdf_data(file) for file in uploaded_files]
        score = compare(resume_texts, JD, flag)

# Tab Results
with tab2:
    st.header("Results")
    my_dict = {}
    if comp_pressed and uploaded_files:
        for i in range(len(score)):
            my_dict[uploaded_files[i].name] = score[i]
        sorted_dict = dict(sorted(my_dict.items()))  # sorted by filename
        for name, match_score in sorted_dict.items():
            with st.expander(str(name)):
                st.write("Score is: ", match_score)
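The Results tab sorts expanders by filename; if ranking by match score is preferred, a hedged variant (assuming the scores are the stringified floats that `Models.cosine` returns):

```python
# Hypothetical variant: rank resumes by score, best match first.
# Scores arrive as strings, so cast to float for sorting.
ranked = sorted(my_dict.items(), key=lambda kv: float(kv[1]), reverse=True)
for name, match_score in ranked:
    with st.expander(str(name)):
        st.write("Score is: ", match_score)
```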
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit
transformers
torch
pdfplumber
nltk
gensim
scikit-learn