MindBody_VS_Medserv / process.py
Alealejandrooo's picture
updated main function
605feb9 verified
import pandas as pd
import gradio as gr
import re
from datetime import timedelta
def _split_client_names(df):
    """Add stripped 'Last Name' / 'First Name' columns derived from 'Client'.

    'Client' is expected to look like "Last, First". Runs of commas are
    collapsed first, then the value is split on the first comma only, so a
    name with extra commas (or with none at all) no longer breaks the
    two-column assignment. Missing parts become empty strings.
    """
    df['Client'] = df['Client'].str.replace(r',+', ',', regex=True)
    parts = df['Client'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
    df['Last Name'] = parts[0].fillna('').astype(str).str.strip()
    df['First Name'] = parts[1].fillna('').astype(str).str.strip()
    return df


def _parse_dates(raw, formats=('%d/%m/%Y', '%Y-%m-%d', '%Y-%m-%d %H:%M:%S')):
    """Parse each value of *raw* with the first of *formats* that fits it.

    Values are parsed element-wise, so a column mixing several formats (or
    containing blanks) is handled; anything unparseable becomes NaT instead
    of leaving the whole column as text.
    """
    text = raw.astype(str).str.strip().reset_index(drop=True)
    first_format, *other_formats = formats
    parsed = pd.to_datetime(text, format=first_format, errors='coerce')
    for fmt in other_formats:
        parsed = parsed.fillna(pd.to_datetime(text, format=fmt, errors='coerce'))
    parsed.index = raw.index
    return parsed


def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
    """Export the MindBody rows that have no counterpart in Medserv.

    A MindBody row counts as matched when some Medserv row has a date of
    service within ``tollerance`` calendar days (either side) and the same
    first name or the same last name, compared case-insensitively.

    Args:
        files_mindbody: uploaded MindBody files, forwarded to ``load_data``.
        files_medserv: uploaded Medserv files, forwarded to ``load_data``.
        tollerance: number of days of slack around the MindBody date; it is
            truncated to an integer.
        progress: Gradio progress tracker used to report the row scan.

    Returns:
        Path of the Excel workbook ('Comparison Results.xlsx', written to the
        current working directory) listing the unmatched MindBody rows.
    """
    mindbody = _split_client_names(load_data(files_mindbody))
    medserv = _split_client_names(load_data(files_medserv))

    mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y')

    # A Medserv cell may hold several comma-separated dates: one row per date.
    medserv['DOS'] = medserv['DOS'].astype(str).str.split(',')
    medserv = medserv.explode('DOS').reset_index(drop=True)
    medserv['DOS'] = _parse_dates(medserv['DOS'])

    # Loop-invariant Medserv views, computed once instead of once per row.
    med_days = medserv['DOS'].dt.normalize()
    med_first = medserv['First Name'].str.lower()
    med_last = medserv['Last Name'].str.lower()

    window = pd.Timedelta(days=int(tollerance))
    rows = list(zip(
        mindbody['DOS'].dt.normalize(),
        mindbody['First Name'].str.lower(),
        mindbody['Last Name'].str.lower(),
    ))

    matched = []
    for day, first, last in progress.tqdm(rows, desc='Analyzing files...'):
        # Comparisons against NaT are False, so undated rows never match.
        in_window = (med_days >= day - window) & (med_days <= day + window)
        # A blank name is "unknown", not a value two records can agree on.
        same_first = (med_first == first) if first else False
        same_last = (med_last == last) if last else False
        # NOTE(review): first OR last name is enough to match, as in the
        # original implementation -- confirm this leniency is intended.
        matched.append(bool((in_window & (same_first | same_last)).any()))

    # Selecting from mindbody keeps the datetime dtype even when nothing is
    # unmatched, so the .dt formatting below cannot fail on an empty frame.
    is_matched = pd.Series(matched, dtype=bool).to_numpy()
    unmatched_df = mindbody.loc[~is_matched].copy()

    # Specify the columns to include in the output Excel file
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']
    # Format the 'DOS' column to remove the time part
    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
    return output_file_path
def load_data(files):
    """Read the uploaded Excel files into one DataFrame with canonical columns.

    Args:
        files: iterable of uploads; each item is either an object exposing
            the file path as ``.name`` or a plain path string.

    Returns:
        A DataFrame with the rows of every file (concatenated when several
        are given). The column detected by ``find_date_column`` is renamed
        to 'DOS' and the one detected by ``find_name_column`` to 'Client';
        a column that cannot be detected is left untouched.

    Raises:
        gr.Error: if no file is provided or a file is not .xls/.xlsx.
    """
    filepaths = [
        file if isinstance(file, str) else file.name
        for file in (files or [])
    ]
    if not filepaths:
        raise gr.Error("No file provided: Please provide a .xls or .xlsx file")

    # Validate every extension (case-insensitively) before reading anything.
    for filepath in filepaths:
        if not filepath.lower().endswith(('.xlsx', '.xls')):
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")

    dfs = [pd.read_excel(filepath) for filepath in filepaths]
    # Concatenate dataframes if more than one file is provided
    df = pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else dfs[0]

    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df = df.rename(columns={date_column: 'DOS'})

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df = df.rename(columns={name_column: 'Client'})

    return df
def find_name_column(df):
    """Return the column whose values most often look like "Last, First".

    Every column is rendered as text and each cell is tested against a
    "last name, first name(s)" pattern. The column with the highest number
    of matching cells wins; on a tie the earliest column is kept. Returns
    None when no cell in any column matches.
    """
    # last name, a comma, one whitespace, then one or more given names
    name_regex = re.compile(r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$")

    def looks_like_name(text):
        return name_regex.match(text) is not None

    best_column, best_hits = None, 0
    for candidate in df.columns:
        hits = df[candidate].astype(str).apply(looks_like_name).sum()
        # Strictly greater: the first column reaching the top score is kept.
        if hits > best_hits:
            best_column, best_hits = candidate, hits
    return best_column
def find_date_column(df):
    """Return the name of the column that holds the dates, or None.

    A column literally named 'Treatment dates' is returned immediately.
    Otherwise every column is rendered as text and the one with the most
    cells containing a date-like token (digits separated by '-' or '/')
    is chosen; on a tie the earliest column is kept. Returns None when no
    cell in any column contains such a token.
    """
    preferred = 'Treatment dates'
    if preferred in df.columns:
        return preferred

    # 2-4 digits, separator, 1-2 digits, separator, 2-4 digits
    date_regex = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"
    hits_per_column = {
        label: df[label].astype(str).str.contains(date_regex, na=False).sum()
        for label in df.columns
    }
    if not hits_per_column:
        return None

    # max() returns the first label with the top score, i.e. earliest wins.
    winner = max(hits_per_column, key=hits_per_column.get)
    if hits_per_column[winner] > 0:
        return winner
    return None