from itertools import chain
from models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
import re
import wordninja
from utils import percentage_difference
from nltk import word_tokenize
class ResumeParser:
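    """Rule-and-model resume parser: segments raw resume lines with
    ResumeSegmenter, then extracts contact details, work history, and
    education using NER models plus regex heuristics."""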
def __init__(self) -> None:
self.resumeSegmenter = ResumeSegmenter()
self.models = Models()
def get_date_index(self, clean_resume_lines, date):
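        """Return the indices of lines that contain the given date string,
        e.g. get_date_index(["intro", "Acme 2019 - 2021"], "2019") -> [1]."""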
indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
return indexes
#better suited to a utils file
def sort_tokens_table(self, tokens_data):
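        """Invert (section, tokens) pairs into a token -> section lookup,
        e.g. [("title", ["Engineer", "Manager"])] ->
        {"Engineer": "title", "Manager": "title"}."""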
table = {}
for key, tokens in tokens_data:
for token in tokens:
table[token] = key
return table
    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        # locate every detected date inside the section
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = [i + start_index for i in chain.from_iterable(dates_indexes)]
        # deduplicate and order the boundaries, bracketing the section body
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
        individual_sections = []
        # slice the lines between consecutive date boundaries
        for current, following in zip(dates_indexes, dates_indexes[1:]):
            section = resume_lines[current:following]
            if section:
                individual_sections.append(section)
        return individual_sections
    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        if sections is None or section_header not in sections:
            return None
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # on the basis that dates would be unique
        return start_index, end_index
def format_output(self, keywords, headlines, isWorkExp=True):
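        """Build one dict per section: the joined text under 'description'
        (re-split via wordninja when the raw text looks fused together, i.e.
        the token counts differ by more than 50%), plus any keyword hits,
        e.g. {"description": "...", "title": [...], "date": [...]}."""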
data = []
for section in headlines:
extracted_data = {}
paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
extracted_data['description'] = paragraph
recovered_headlines = ' '.join(wordninja.split(paragraph))
if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
extracted_data['description'] = recovered_headlines
for attr in keywords:
result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines]))
if len(result) > 0:
extracted_data[attr[0]] = result
data.append(extracted_data)
return data
def parse_work_history(self, resume_lines, sections):
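        """Split the work section at each detected date, then label every
        chunk with NER hits: dates, job titles, companies, locations."""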
start_index, end_index = sections['work_and_employment']
text = ' '.join(resume_lines[start_index:end_index])
recovered_text = ' '.join(wordninja.split(text))
work_dates, companies, locations = self.models.get_ner(text, recovered_text)
single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index]))
job_positions = entity_dict['job title']
keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
return self.format_output(keywords, single_work_experiences)
def parse_education(self, resume_lines, sections):
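        """Apply the same date-based splitting to the education section,
        labelling chunks with degree, major, university, and location hits."""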
start_index, end_index = sections["education_and_training"]
text = ' '.join(resume_lines[start_index:end_index])
dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index]))
degrees = entity_dict['degree']
majors = entity_dict['major']
keywords = [("date", dates), ("major", majors), ("degree", degrees),
("university", universities), ("location", locations)]
output = self.format_output(keywords, single_education_experiences, False)
output = [res for res in output if res]
return output
def parse_basic_info(self,resume_lines, sections):
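        """Extract the candidate's name via NER and email/phone via regex,
        falling back to the full resume when the header section is empty."""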
start_index, end_index = sections["basics_info"]
text = ' '.join(resume_lines[start_index:end_index])
phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
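        # The loose phone pattern accepts common formats such as
        # "+1 (555) 123-4567" or "555-123-4567"; the email pattern matches
        # addresses like "jane.doe@example.com" (examples are hypothetical).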
        entities = self.models.ner(text)
        # fall back to the whole resume if the header section yields no entities
        if len(entities) == 0:
            entities = self.models.ner(' '.join(resume_lines))
        output = {}
        score = 0
        for entity in entities:
            # keep the highest-scoring multi-word PER entity as the candidate's name
            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
                output['name'] = entity['word']
                score = entity['score']
        email = re.findall(email_pattern, text)
        phone = re.findall(phone_pattern, text)
        # re.findall returns a list, so test for emptiness rather than comparing to ''
        if not email:
            email = re.findall(email_pattern, ' '.join(resume_lines))
        if not phone:
            phone = re.findall(phone_pattern, ' '.join(resume_lines))
output['email'] = email[0] if len(email) > 0 else ''
output['phone'] = phone[0] if len(phone) > 0 else ''
return output
def parse(self, resume_lines):
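        """Entry point: reset the segmenter state, segment the resume lines,
        parse the known sections, and keep any remaining section as
        wordninja-recovered plain text."""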
self.resumeSegmenter.resume_segments = {
'objective': [],
'work_and_employment': [],
'education_and_training': [],
'skills': [],
'accomplishments': [],
'misc': []
}
self.resumeSegmenter.resume_indices = []
sections = self.resumeSegmenter.segment(resume_lines)
if sections is None:
return {}
jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
result = {"basic_info":basic_info, "education":education, "work_experience":jobs}
for section in sections.keys():
if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
result[section] =' '.join(wordninja.split(text))
return result |
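# Minimal usage sketch (assumes the accompanying Models and ResumeSegmenter
# modules and their model weights are available; the sample lines below are
# hypothetical):
if __name__ == "__main__":
    sample_lines = [
        "Jane Doe",
        "jane.doe@example.com +1 (555) 123-4567",
        "Work Experience",
        "Software Engineer, Acme Corp, 2019 - 2022",
        "Education",
        "B.Sc. Computer Science, State University, 2015 - 2019",
    ]
    parser = ResumeParser()
    print(parser.parse(sample_lines))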