# Resume parser: extracts structured basic-info, work-history and education data from resume lines.
from itertools import chain | |
from models import Models | |
#from output_model import OutputModel, WorkExperience | |
from segmenter import ResumeSegmenter | |
from flashtext import KeywordProcessor | |
from collections import defaultdict | |
import re | |
import wordninja | |
from utils import percentage_difference | |
from nltk import word_tokenize | |
class ResumeParser():
    """Parse segmented resume lines into structured records.

    Uses ResumeSegmenter to locate sections, NER models (via Models) to pull
    out dates/companies/titles/etc., and regex/keyword matching for contact
    details. Entry point is parse().
    """

    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        """Return the indexes of every line in *clean_resume_lines* containing *date*."""
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        """Invert (key, tokens) pairs into a token -> key lookup table.

        NOTE: a token listed under several keys keeps the last key seen.
        (This method was previously defined twice in the file; the duplicate
        definition has been removed.)
        """
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        """Split resume_lines[start_index:end_index] into per-experience chunks.

        Lines containing one of *work_dates* act as chunk boundaries; the
        section bounds (start_index + 1 and end_index) are always boundaries.
        Returns a list of non-empty line-lists, one per experience entry.
        """
        date_indexes = chain.from_iterable(
            self.get_date_index(resume_lines[start_index:end_index], work_date)
            for work_date in work_dates
        )
        # Shift matches back to absolute positions, add the section bounds,
        # and de-duplicate in a single sorted(set(...)) pass.
        boundaries = sorted({start_index + 1, end_index,
                             *(i + start_index for i in date_indexes)})
        individual_sections = []
        for i, index in enumerate(boundaries):
            section = resume_lines[index:boundaries[min(i + 1, len(boundaries) - 1)]]
            if section:
                individual_sections.append(section)
        return individual_sections

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        """Return (start_index, end_index) of *section_header* within the parsed
        sections, or None when segmentation fails or the header is absent.
        """
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        # (A leftover debug print(sections) was removed here.)
        if sections is None or section_header not in sections:
            return None
        # on the basis dates would be unique
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        return start_index, end_index

    def format_output(self, keywords, headlines, isWorkExp=True):
        """Build one output dict per section in *headlines*.

        keywords: list of (field_name, candidate_strings) pairs; a candidate is
        attached to a section when it occurs in the section text, either raw or
        after word-boundary recovery with wordninja.
        """
        data = []
        for section in headlines:
            extracted_data = {}
            paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
            extracted_data['description'] = paragraph
            recovered_headlines = ' '.join(wordninja.split(paragraph))
            # If recovering word boundaries changes the token count a lot, the
            # original text was likely run together; prefer the recovered form.
            if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
                extracted_data['description'] = recovered_headlines
            for field, candidates in keywords:
                result = list({s for s in candidates if s in paragraph or s in recovered_headlines})
                if result:
                    extracted_data[field] = result
            data.append(extracted_data)
        return data

    def parse_work_history(self, resume_lines, sections):
        """Parse the work_and_employment section into a list of experience dicts."""
        start_index, end_index = sections['work_and_employment']
        text = ' '.join(resume_lines[start_index:end_index])
        recovered_text = ' '.join(wordninja.split(text))
        work_dates, companies, locations = self.models.get_ner(text, recovered_text)
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index]))
        job_positions = entity_dict['job title']
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines, sections):
        """Parse the education_and_training section into a list of education dicts."""
        start_index, end_index = sections["education_and_training"]
        text = ' '.join(resume_lines[start_index:end_index])
        dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
        single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
        # Skip the section header line itself when extracting degree/major entities.
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index + 1:end_index]))
        degrees = entity_dict['degree']
        majors = entity_dict['major']
        keywords = [("date", dates), ("major", majors), ("degree", degrees),
                    ("university", universities), ("location", locations)]
        output = self.format_output(keywords, single_education_experiences, False)
        # Drop empty dicts so callers only see populated education entries.
        return [res for res in output if res]

    def parse_basic_info(self, resume_lines, sections):
        """Extract name, email and phone from the basics_info section.

        The name is the highest-scoring PER entity containing a space. Email
        and phone fall back to scanning the whole resume when the basics
        section yields no match.
        """
        start_index, end_index = sections["basics_info"]
        text = ' '.join(resume_lines[start_index:end_index])
        phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        entities = self.models.ner(text)
        if len(entities) == 0:
            entities = self.models.ner(' '.join(resume_lines))
        output = {}
        score = 0
        for entity in entities:
            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
                output['name'] = entity['word']
                score = entity['score']
        email = re.findall(email_pattern, text)
        phone = re.findall(phone_pattern, text)
        # BUGFIX: re.findall returns a list, so the old `== ''` comparison was
        # never true and the whole-resume fallback was dead code. Use
        # truthiness to detect "no match in the basics section".
        if not email:
            email = re.findall(email_pattern, ' '.join(resume_lines))
        if not phone:
            phone = re.findall(phone_pattern, ' '.join(resume_lines))
        output['email'] = email[0] if email else ''
        output['phone'] = phone[0] if phone else ''
        return output

    def parse(self, resume_lines):
        """Segment *resume_lines* and parse every recognized section.

        Returns a dict with 'basic_info', 'education' and 'work_experience'
        keys plus one entry per remaining segmented section; returns {} when
        segmentation fails entirely.
        """
        # Reset segmenter state so repeated parse() calls don't accumulate
        # results from a previous resume.
        self.resumeSegmenter.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        self.resumeSegmenter.resume_indices = []
        sections = self.resumeSegmenter.segment(resume_lines)
        if sections is None:
            return {}
        jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
        education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
        basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
        result = {"basic_info": basic_info, "education": education, "work_experience": jobs}
        # Remaining sections are returned as plain text with word boundaries recovered.
        for section in sections.keys():
            if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
                text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
                result[section] = ' '.join(wordninja.split(text))
        return result