Spaces:

Alealejandrooo
/

MindBody_VS_Medserv

Running

File size: 6,419 Bytes

import pandas as pd
import gradio as gr
import re 
from datetime import timedelta


def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):

    try:
        mindbody = load_data(files_mindbody)
        medserv = load_data(files_medserv)
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        return None
    
    try:
        # Remove multiple commas from the 'Client' column
        medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True)
        mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True)
        
        # Split 'Client' names into first name and last name components for both DataFrames
        medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
        mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
    except Exception as e:
        print(f"An error occurred while processing client names: {e}")
    
    try:
        mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y')
    except Exception as e:
        print(f"An error occurred while converting dates in mindbody: {e}")
    
    try:
        # Split dates if they contain commas in the 'DOS' column of medserv
        medserv['DOS'] = medserv['DOS'].astype(str)
        medserv['DOS'] = medserv['DOS'].str.split(',')
        medserv = medserv.explode('DOS')
        
        # Attempt to convert dates using multiple formats
        formats_to_try = ['%d/%m/%Y', '%Y-%m-%d']  # Add more formats as needed
        for format_to_try in formats_to_try:
            try:
                medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try)
                break  # Break out of loop if conversion succeeds
            except ValueError:
                continue  # Continue to next format if conversion fails
    except Exception as e:
        print(f"An error occurred while processing dates in medserv: {e}")
    
    unmatched_rows = []

    try:
        rows = len(mindbody)

        # Iterate through each row in the mindbody DataFrame
        for idx in progress(range(rows), desc='Analyzing files...'):
            # Extract relevant information from the current row
            date = mindbody.iloc[idx]['DOS']
            first_name = mindbody.iloc[idx]['First Name']
            last_name = mindbody.iloc[idx]['Last Name']

            # Define the range of dates to search for a match in medserv
            date_range = [date - timedelta(days=i) for i in range(tolerance, -tolerance-1, -1)]
            # Remove the time component from the dates in date_range
            date_range = [d.date() for d in date_range]

            # Filter medserv based on the date range and name criteria
            matches = medserv[((medserv['DOS'].dt.date.isin(date_range)) & 
                               ((medserv['First Name'].str.lower() == first_name.lower()) | 
                                (medserv['Last Name'].str.lower() == last_name.lower())))]

            # If no match is found, append the row to the unmatched_rows list
            if matches.empty:
                unmatched_rows.append(mindbody.iloc[idx])
    except Exception as e:
        print(f"An error occurred while analyzing files: {e}")

    try:
        # Create a DataFrame from the unmatched_rows list
        unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)

        # Specify the columns to include in the output Excel file
        columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']

        # Format the 'DOS' column to remove time part
        unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

        output_file_path = 'Comparison Results.xlsx'
        unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
        
        return output_file_path
    except Exception as e:
        print(f"An error occurred while creating the output file: {e}")
        return None



def load_data(files):
    # Check if a single file or multiple files are provided
    filepaths = [file.name for file in files]
    
    # Load and concatenate multiple files if provided
    dfs = []
    for filepath in filepaths:
        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
            dfs.append(pd.read_excel(filepath))
        else:
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
    
    # Concatenate dataframes if more than one file is provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]
    
    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df


def find_name_column(df):
    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)

    max_count = 0
    name_column = None

    for column in df.columns:
        # Count matches of the name pattern in each column
        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
        valid_count = matches.sum()  # Sum of True values indicating valid names

        # Select the column with the maximum count of valid names
        if valid_count > max_count:
            max_count = valid_count
            name_column = column

    return name_column


def find_date_column(df):
    # Check if 'Treatment dates' column exists
    if 'Treatment dates' in df.columns:
        return 'Treatment dates'

    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats

    max_count = 0
    date_column = None

    for column in df.columns:
        # Count matches of the date pattern in each column
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()  # Sum of True values indicating valid dates

        # Select the column with the maximum count of valid dates
        if valid_count > max_count:
            max_count = valid_count
            date_column = column

    return date_column