Spaces:

Alealejandrooo
/

MindBody_VS_Medserv

Sleeping

App Files Files Community

MindBody_VS_Medserv / process.py

Alealejandrooo

First commit

18c6efd verified 4 months ago

raw

history blame

No virus

4.49 kB

	import pandas as pd
	import gradio as gr
	import re
	from datetime import timedelta


	def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
	    """Write the MindBody rows that have no counterpart in Medserv to Excel.

	    Both uploads are read with ``load_data``. A MindBody row counts as
	    matched when at least one Medserv row has a 'DOS' within ``tollerance``
	    days of it (inclusive on both sides) and carries the same first name
	    or the same last name.

	    Args:
	        files_mindbody: MindBody uploads, handed to ``load_data`` unchanged.
	        files_medserv: Medserv uploads, handed to ``load_data`` unchanged.
	        tollerance: Half-width of the date window, in days.
	        progress: Gradio progress tracker; its ``tqdm`` wrapper drives the
	            progress bar while the rows are scanned.

	    Returns:
	        The path of the workbook that was written
	        ('Comparison Results.xlsx').

	    Raises:
	        KeyError: If a loaded frame lacks 'DOS' or 'Client', or the
	            MindBody frame lacks one of the exported columns.
	    """
	    mindbody = load_data(files_mindbody)
	    medserv = load_data(files_medserv)

	    # Split 'Client' ("Last, First") into name components. Splitting only
	    # once keeps names that contain a second comma from producing a third
	    # column, and stripping makes "Doe,John" and "Doe, John" comparable.
	    for frame in (mindbody, medserv):
	        parts = frame['Client'].str.split(',', n=1, expand=True)
	        frame['Last Name'] = parts[0].str.strip()
	        frame['First Name'] = parts[1].str.strip() if 1 in parts.columns else None

	    window = timedelta(days=tollerance)

	    # Pull the columns out once instead of re-indexing the frames per row.
	    dos = mindbody['DOS']
	    first_names = mindbody['First Name']
	    last_names = mindbody['Last Name']
	    medserv_dos = medserv['DOS']
	    medserv_first = medserv['First Name']
	    medserv_last = medserv['Last Name']

	    # One flag per MindBody row: True when no Medserv row matches it.
	    unmatched_flags = []
	    for idx in progress.tqdm(range(len(mindbody)), desc='Analyzing files...'):
	        date = dos.iat[idx]

	        # Every day inside the window counts, not only its two end points.
	        in_window = medserv_dos.between(date - window, date + window)

	        # NOTE(review): sharing the first name OR the last name is enough
	        # to count as a match; confirm that this leniency is intended.
	        same_person = ((medserv_first == first_names.iat[idx]) |
	                       (medserv_last == last_names.iat[idx]))

	        unmatched_flags.append(not (in_window & same_person).any())

	    # Selecting with a mask keeps the column dtypes even when nothing is
	    # unmatched, so the date formatting below also works on an empty result.
	    mask = pd.Series(unmatched_flags, index=mindbody.index, dtype=bool)
	    unmatched_df = mindbody[mask].copy()

	    # Specify the columns to include in the output Excel file
	    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']

	    # Format the 'DOS' column to remove the time part
	    unmatched_df['DOS'] = pd.to_datetime(unmatched_df['DOS']).dt.strftime('%d-%m-%Y')

	    output_file_path = 'Comparison Results.xlsx'
	    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)

	    return output_file_path



	def load_data(files):
	    """Read one or more Excel uploads into a single DataFrame.

	    Args:
	        files: A list or tuple of uploads, or a single upload. Each item is
	            either a path string or an object exposing its path as
	            ``.name``.

	    Returns:
	        The concatenated DataFrame (fresh 0..n-1 index when several files
	        were given). The column picked by ``find_date_column`` is renamed
	        to 'DOS' and the one picked by ``find_name_column`` to 'Client';
	        when a finder returns None the columns are left untouched.

	    Raises:
	        gr.Error: If nothing was uploaded or a file is not .xls/.xlsx.
	    """
	    if not files:
	        raise gr.Error("No file provided: Please provide a .xls or .xlsx file")

	    # Accept a single upload as well as a collection of uploads.
	    if not isinstance(files, (list, tuple)):
	        files = [files]

	    frames = []
	    for file in files:
	        # Upload objects carry the path in `.name`; plain paths are used as is.
	        filepath = getattr(file, 'name', file)

	        # Compare case-insensitively so that e.g. "REPORT.XLSX" is accepted.
	        if not str(filepath).lower().endswith(('.xlsx', '.xls')):
	            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
	        frames.append(pd.read_excel(filepath))

	    # Concatenate dataframes if more than one file is provided
	    df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

	    # Find and rename the date column to 'DOS'. Compare against None so a
	    # falsy but valid column label (such as 0) is still renamed.
	    date_column = find_date_column(df)
	    if date_column is not None:
	        df.rename(columns={date_column: 'DOS'}, inplace=True)

	    # Find and rename the name column to 'Client'
	    name_column = find_name_column(df)
	    if name_column is not None:
	        df.rename(columns={name_column: 'Client'}, inplace=True)

	    return df


	def find_name_column(df):
	    """Return the label of the column that looks most like 'Last, First' names.

	    Each column is cast to ``str`` and every value is matched, from its
	    start, against the name pattern. The first column with the strictly
	    highest number of matching values wins; ``None`` is returned when no
	    column contains a single match.
	    """
	    # Last name, a comma, one whitespace character, then the first name(s)
	    name_regex = re.compile(r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$")

	    best_column = None
	    best_hits = 0

	    for label in df.columns:
	        # Number of values in this column that look like a name
	        hits = sum(
	            name_regex.match(text) is not None
	            for text in df[label].astype(str)
	        )

	        # Only a strictly larger count replaces the current best, so the
	        # earliest column is kept on ties.
	        if hits > best_hits:
	            best_column, best_hits = label, hits

	    return best_column


	def find_date_column(df):
	    """Return the label of the column holding the most date-like values.

	    Each column is cast to ``str`` and searched for the date pattern below.
	    The first column with the highest number of hits is returned; ``None``
	    is returned when no column contains a single hit.
	    """
	    # Digit groups separated by '-' or '/', e.g. 2024-01-31 or 31/01/2024
	    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"

	    # Hit count per column, in column order
	    hits_by_column = {
	        label: df[label].astype(str).str.contains(date_pattern, na=False).sum()
	        for label in df.columns
	    }

	    # max() keeps the first of several equal maxima, i.e. the earliest column
	    winner = max(hits_by_column, key=hits_by_column.get, default=None)
	    if winner is None or not hits_by_column[winner] > 0:
	        return None
	    return winner