|
import pandas as pd |
|
import gradio as gr |
|
import re |
|
from datetime import timedelta |
|
|
|
|
|
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()): |
|
|
|
mindbody = load_data(files_mindbody) |
|
medserv = load_data(files_medserv) |
|
|
|
medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True) |
|
mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True) |
|
|
|
|
|
medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True) |
|
mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True) |
|
|
|
mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y') |
|
|
|
|
|
medserv['DOS'] = medserv['DOS'].astype(str) |
|
medserv['DOS'] = medserv['DOS'].str.split(',') |
|
medserv = medserv.explode('DOS') |
|
|
|
|
|
formats_to_try = ['%d/%m/%Y', '%Y-%m-%d'] |
|
for format_to_try in formats_to_try: |
|
try: |
|
medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try) |
|
break |
|
except ValueError: |
|
continue |
|
|
|
|
|
unmatched_rows = [] |
|
|
|
rows = len(mindbody) |
|
|
|
|
|
for idx in progress.tqdm(range(rows), desc='Analyzing files...'): |
|
|
|
date = mindbody.iloc[idx]['DOS'] |
|
first_name = mindbody.iloc[idx]['First Name'] |
|
last_name = mindbody.iloc[idx]['Last Name'] |
|
|
|
|
|
date_range = [date - timedelta(days=i) for i in range(tollerance, -tollerance-1, -1)] |
|
|
|
date_range = [d.date() for d in date_range] |
|
|
|
|
|
matches = medserv[((medserv['DOS'].dt.date.isin(date_range)) & |
|
((medserv['First Name'].str.lower() == first_name.lower()) | |
|
(medserv['Last Name'].str.lower() == last_name.lower())))] |
|
|
|
|
|
if matches.empty: |
|
unmatched_rows.append(mindbody.iloc[idx]) |
|
|
|
|
|
unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns) |
|
|
|
|
|
columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total'] |
|
|
|
|
|
unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y') |
|
|
|
output_file_path = 'Comparison Results.xlsx' |
|
unmatched_df[columns_to_include].to_excel(output_file_path, index=False) |
|
|
|
return output_file_path |
|
|
|
|
|
|
|
|
|
def load_data(files): |
|
|
|
filepaths = [file.name for file in files] |
|
|
|
|
|
dfs = [] |
|
for filepath in filepaths: |
|
if filepath.endswith('.xlsx') or filepath.endswith('.xls'): |
|
dfs.append(pd.read_excel(filepath)) |
|
else: |
|
raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file") |
|
|
|
|
|
if len(dfs) > 1: |
|
df = pd.concat(dfs, ignore_index=True) |
|
else: |
|
df = dfs[0] |
|
|
|
|
|
date_column = find_date_column(df) |
|
if date_column: |
|
df.rename(columns={date_column: 'DOS'}, inplace=True) |
|
|
|
|
|
name_column = find_name_column(df) |
|
if name_column: |
|
df.rename(columns={name_column: 'Client'}, inplace=True) |
|
|
|
return df |
|
|
|
|
|
def find_name_column(df): |
|
name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$" |
|
|
|
max_count = 0 |
|
name_column = None |
|
|
|
for column in df.columns: |
|
|
|
matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x))) |
|
valid_count = matches.sum() |
|
|
|
|
|
if valid_count > max_count: |
|
max_count = valid_count |
|
name_column = column |
|
|
|
return name_column |
|
|
|
|
|
def find_date_column(df): |
|
|
|
if 'Treatment dates' in df.columns: |
|
return 'Treatment dates' |
|
|
|
date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b" |
|
|
|
max_count = 0 |
|
date_column = None |
|
|
|
for column in df.columns: |
|
|
|
matches = df[column].astype(str).str.contains(date_pattern, na=False) |
|
valid_count = matches.sum() |
|
|
|
|
|
if valid_count > max_count: |
|
max_count = valid_count |
|
date_column = column |
|
|
|
return date_column |
|
|