|
import pandas as pd
|
|
import gradio as gr
|
|
import re
|
|
from datetime import timedelta
|
|
|
|
|
|
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
|
|
|
|
mindbody = load_data(files_mindbody)
|
|
medserv = load_data(files_medserv)
|
|
|
|
|
|
medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
|
|
mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
|
|
|
|
unmatched_rows = []
|
|
|
|
rows = len(mindbody)
|
|
|
|
|
|
for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
|
|
|
|
date = mindbody.iloc[idx]['DOS']
|
|
first_name = mindbody.iloc[idx]['First Name']
|
|
last_name = mindbody.iloc[idx]['Last Name']
|
|
|
|
|
|
date_range = [date - timedelta(days= tollerance), date, date + timedelta(days=tollerance)]
|
|
|
|
|
|
matches = medserv[((medserv['DOS'].isin(date_range)) &
|
|
((medserv['First Name'] == first_name) |
|
|
(medserv['Last Name'] == last_name)))]
|
|
|
|
|
|
if matches.empty:
|
|
unmatched_rows.append(mindbody.iloc[idx])
|
|
|
|
|
|
unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)
|
|
|
|
|
|
columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']
|
|
|
|
|
|
unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')
|
|
|
|
output_file_path = 'Comparison Results.xlsx'
|
|
unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
|
|
|
|
return output_file_path
|
|
|
|
|
|
|
|
def load_data(files):
|
|
|
|
filepaths = [file.name for file in files]
|
|
|
|
|
|
dfs = []
|
|
for filepath in filepaths:
|
|
if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
|
|
dfs.append(pd.read_excel(filepath))
|
|
else:
|
|
raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
|
|
|
|
|
|
if len(dfs) > 1:
|
|
df = pd.concat(dfs, ignore_index=True)
|
|
else:
|
|
df = dfs[0]
|
|
|
|
|
|
date_column = find_date_column(df)
|
|
if date_column:
|
|
df.rename(columns={date_column: 'DOS'}, inplace=True)
|
|
|
|
|
|
name_column = find_name_column(df)
|
|
if name_column:
|
|
df.rename(columns={name_column: 'Client'}, inplace=True)
|
|
|
|
return df
|
|
|
|
|
|
def find_name_column(df):
|
|
name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"
|
|
|
|
max_count = 0
|
|
name_column = None
|
|
|
|
for column in df.columns:
|
|
|
|
matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
|
|
valid_count = matches.sum()
|
|
|
|
|
|
if valid_count > max_count:
|
|
max_count = valid_count
|
|
name_column = column
|
|
|
|
return name_column
|
|
|
|
|
|
def find_date_column(df):
|
|
date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"
|
|
|
|
max_count = 0
|
|
date_column = None
|
|
|
|
for column in df.columns:
|
|
|
|
matches = df[column].astype(str).str.contains(date_pattern, na=False)
|
|
valid_count = matches.sum()
|
|
|
|
|
|
if valid_count > max_count:
|
|
max_count = valid_count
|
|
date_column = column
|
|
|
|
return date_column |