import pandas as pd
import gradio as gr
import re
from datetime import timedelta


def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):
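    """Compare the MindBody export against the Medserv export and report unmatched rows.

    For each MindBody row, a Medserv row counts as a match when its date of service
    falls within +/- `tolerance` days and either the first or the last name matches
    (case-insensitively). Unmatched rows are written to an Excel file whose path is
    returned for download.
    """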
    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    # Collapse runs of consecutive commas in the 'Client' column before splitting
    medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True)
    mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True)

    # Split 'Client' names into first name and last name components for both DataFrames
    medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
    mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)

    mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y')

    # Split dates if they contain commas in the 'DOS' column of medserv
    medserv['DOS'] = medserv['DOS'].astype(str)
    medserv['DOS'] = medserv['DOS'].str.split(',')
    medserv = medserv.explode('DOS')

    # Attempt to convert dates using multiple formats
    formats_to_try = ['%d/%m/%Y', '%Y-%m-%d']  # Add more formats as needed
    for format_to_try in formats_to_try:
        try:
            medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try)
            break  # Break out of the loop if conversion succeeds
        except ValueError:
            continue  # Try the next format if conversion fails

    # Initialize an empty list to store unmatched rows
    unmatched_rows = []
    rows = len(mindbody)

    # Iterate through each row in the mindbody DataFrame
    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
        # Extract relevant information from the current row
        date = mindbody.iloc[idx]['DOS']
        first_name = mindbody.iloc[idx]['First Name']
        last_name = mindbody.iloc[idx]['Last Name']

        # Define the range of dates to search for a match in medserv
        date_range = [date - timedelta(days=i) for i in range(tolerance, -tolerance - 1, -1)]
        # Remove the time component from the dates in date_range
        date_range = [d.date() for d in date_range]

        # Filter medserv based on the date range and name criteria
        matches = medserv[(medserv['DOS'].dt.date.isin(date_range)) &
                          ((medserv['First Name'].str.lower() == first_name.lower()) |
                           (medserv['Last Name'].str.lower() == last_name.lower()))]

        # If no match is found, append the row to the unmatched_rows list
        if matches.empty:
            unmatched_rows.append(mindbody.iloc[idx])

    # Create a DataFrame from the unmatched_rows list
    unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)

    # Specify the columns to include in the output Excel file
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']

    # Format the 'DOS' column to remove the time part (skipped when every row matched,
    # because the .dt accessor would fail on an empty object-dtype column)
    if not unmatched_df.empty:
        unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
    return output_file_path


def load_data(files):
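    """Load one or more Excel exports and normalise their column names.

    The date and client-name columns are auto-detected and renamed to 'DOS' and
    'Client' respectively, so the rest of the pipeline can rely on those names.
    """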
    # Collect the paths of the uploaded file(s)
    filepaths = [file.name for file in files]

    # Load each file, accepting only Excel workbooks
    dfs = []
    for filepath in filepaths:
        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
            dfs.append(pd.read_excel(filepath))
        else:
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")

    # Concatenate the DataFrames if more than one file was provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]

    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df


def find_name_column(df):
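    """Return the column whose values most often look like 'Last, First' names, or None."""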
    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)
    max_count = 0
    name_column = None
    for column in df.columns:
        # Count matches of the name pattern in each column
        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
        valid_count = matches.sum()  # Sum of True values indicating valid names
        # Select the column with the maximum count of valid names
        if valid_count > max_count:
            max_count = valid_count
            name_column = column
    return name_column


def find_date_column(df):
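    """Return the column whose values most often look like dates, or None."""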
    # Check if a 'Treatment dates' column exists and prefer it when present
    if 'Treatment dates' in df.columns:
        return 'Treatment dates'

    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats
    max_count = 0
    date_column = None
    for column in df.columns:
        # Count matches of the date pattern in each column
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()  # Sum of True values indicating valid dates
        # Select the column with the maximum count of valid dates
        if valid_count > max_count:
            max_count = valid_count
            date_column = column
    return date_column
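

# The original file continues beyond this excerpt with the Gradio UI definition.
# The block below is only a hypothetical sketch (not the author's actual UI) of how
# process_data could be wired into a gr.Interface; the component labels and the
# default tolerance of 3 days are assumptions.
if __name__ == '__main__':
    demo = gr.Interface(
        fn=process_data,
        inputs=[
            gr.File(file_count='multiple', label='MindBody export(s)'),
            gr.File(file_count='multiple', label='Medserv export(s)'),
            # precision=0 makes Gradio pass an int, which range() inside process_data expects
            gr.Number(value=3, precision=0, label='Date tolerance (days)'),
        ],
        outputs=gr.File(label='Comparison Results'),
        title='MindBody vs. Medserv comparison',
    )
    demo.launch()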