# Resume parser: extracts structured basic-info, work-history and education data from resume lines.
from itertools import chain | |
from models import Models | |
#from output_model import OutputModel, WorkExperience | |
from segmenter import ResumeSegmenter | |
from flashtext import KeywordProcessor | |
from collections import defaultdict | |
import re | |
import wordninja | |
from utils import percentage_difference | |
from nltk import word_tokenize | |
class ResumeParser():
    """Parse segmented resume lines into structured records.

    Uses ResumeSegmenter to locate sections, NER models (via Models) to pull
    out dates/companies/titles/etc., and regex/keyword matching for contact
    details. Entry point is parse().
    """

    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        """Return the indexes of every line in *clean_resume_lines* containing *date*."""
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        """Invert (key, tokens) pairs into a token -> key lookup table.

        NOTE: a token listed under several keys keeps the last key seen.
        (This method was previously defined twice in the file; the duplicate
        definition has been removed.)
        """
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        """Split resume_lines[start_index:end_index] into per-experience chunks.

        Lines containing one of *work_dates* act as chunk boundaries; the
        section bounds (start_index + 1 and end_index) are always boundaries.
        Returns a list of non-empty line-lists, one per experience entry.
        """
        date_indexes = chain.from_iterable(
            self.get_date_index(resume_lines[start_index:end_index], work_date)
            for work_date in work_dates
        )
        # Shift matches back to absolute positions, add the section bounds,
        # and de-duplicate in a single sorted(set(...)) pass.
        boundaries = sorted({start_index + 1, end_index,
                             *(i + start_index for i in date_indexes)})
        individual_sections = []
        for i, index in enumerate(boundaries):
            section = resume_lines[index:boundaries[min(i + 1, len(boundaries) - 1)]]
            if section:
                individual_sections.append(section)
        return individual_sections

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        """Return (start_index, end_index) of *section_header* within the parsed
        sections, or None when segmentation fails or the header is absent.
        """
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        # (A leftover debug print(sections) was removed here.)
        if sections is None or section_header not in sections:
            return None
        # on the basis dates would be unique
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        return start_index, end_index

    def format_output(self, keywords, headlines, isWorkExp=True):
        """Build one output dict per section in *headlines*.

        keywords: list of (field_name, candidate_strings) pairs; a candidate is
        attached to a section when it occurs in the section text, either raw or
        after word-boundary recovery with wordninja.
        """
        data = []
        for section in headlines:
            extracted_data = {}
            paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
            extracted_data['description'] = paragraph
            recovered_headlines = ' '.join(wordninja.split(paragraph))
            # If recovering word boundaries changes the token count a lot, the
            # original text was likely run together; prefer the recovered form.
            if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
                extracted_data['description'] = recovered_headlines
            for field, candidates in keywords:
                result = list({s for s in candidates if s in paragraph or s in recovered_headlines})
                if result:
                    extracted_data[field] = result
            data.append(extracted_data)
        return data

    def parse_work_history(self, resume_lines, sections):
        """Parse the work_and_employment section into a list of experience dicts."""
        start_index, end_index = sections['work_and_employment']
        text = ' '.join(resume_lines[start_index:end_index])
        recovered_text = ' '.join(wordninja.split(text))
        work_dates, companies, locations = self.models.get_ner(text, recovered_text)
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index]))
        job_positions = entity_dict['job title']
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines, sections):
        """Parse the education_and_training section into a list of education dicts."""
        start_index, end_index = sections["education_and_training"]
        text = ' '.join(resume_lines[start_index:end_index])
        dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
        single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
        # Skip the section header line itself when extracting degree/major entities.
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index + 1:end_index]))
        degrees = entity_dict['degree']
        majors = entity_dict['major']
        keywords = [("date", dates), ("major", majors), ("degree", degrees),
                    ("university", universities), ("location", locations)]
        output = self.format_output(keywords, single_education_experiences, False)
        # Drop empty dicts so callers only see populated education entries.
        return [res for res in output if res]

    def parse_basic_info(self, resume_lines, sections):
        """Extract name, email and phone from the basics_info section.

        The name is the highest-scoring PER entity containing a space. Email
        and phone fall back to scanning the whole resume when the basics
        section yields no match.
        """
        start_index, end_index = sections["basics_info"]
        text = ' '.join(resume_lines[start_index:end_index])
        phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        entities = self.models.ner(text)
        if len(entities) == 0:
            entities = self.models.ner(' '.join(resume_lines))
        output = {}
        score = 0
        for entity in entities:
            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
                output['name'] = entity['word']
                score = entity['score']
        email = re.findall(email_pattern, text)
        phone = re.findall(phone_pattern, text)
        # BUGFIX: re.findall returns a list, so the old `== ''` comparison was
        # never true and the whole-resume fallback was dead code. Use
        # truthiness to detect "no match in the basics section".
        if not email:
            email = re.findall(email_pattern, ' '.join(resume_lines))
        if not phone:
            phone = re.findall(phone_pattern, ' '.join(resume_lines))
        output['email'] = email[0] if email else ''
        output['phone'] = phone[0] if phone else ''
        return output

    def parse(self, resume_lines):
        """Segment *resume_lines* and parse every recognized section.

        Returns a dict with 'basic_info', 'education' and 'work_experience'
        keys plus one entry per remaining segmented section; returns {} when
        segmentation fails entirely.
        """
        # Reset segmenter state so repeated parse() calls don't accumulate
        # results from a previous resume.
        self.resumeSegmenter.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        self.resumeSegmenter.resume_indices = []
        sections = self.resumeSegmenter.segment(resume_lines)
        if sections is None:
            return {}
        jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
        education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
        basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
        result = {"basic_info": basic_info, "education": education, "work_experience": jobs}
        # Remaining sections are returned as plain text with word boundaries recovered.
        for section in sections.keys():
            if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
                text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
                result[section] = ' '.join(wordninja.split(text))
        return result