import pandas as pd
import gradio as gr
import re 
from datetime import timedelta
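
# This script cross-checks a MindBody sales export against a Medserv billing
# export and reports the MindBody rows that have no matching Medserv entry
# within a configurable date tolerance.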


def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):
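    """Compare the MindBody report(s) against the Medserv report(s) and write
    the MindBody rows that have no Medserv match (same first or last name
    within `tolerance` days) to an Excel file; return the output file path.
    """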

    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    medserv['Client'] = medserv['Client'].str.replace(r',+', ',', regex=True)
    mindbody['Client'] = mindbody['Client'].str.replace(r',+', ',', regex=True)

    # Split 'Client' names into last name and first name components for both
    # DataFrames; limit to a single split so any extra comma in a name does
    # not produce more than two columns
    medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', n=1, expand=True)
    mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', n=1, expand=True)

    mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y')

    # Split dates if they contain commas in the 'DOS' column of medserv
    medserv['DOS'] = medserv['DOS'].astype(str)
    medserv['DOS'] = medserv['DOS'].str.split(',')
    medserv = medserv.explode('DOS')

    # Attempt to convert dates using multiple formats
    formats_to_try = ['%d/%m/%Y', '%Y-%m-%d']  # Add more formats as needed
    for format_to_try in formats_to_try:
        try:
            medserv['DOS'] = pd.to_datetime(medserv['DOS'].str.strip(), format=format_to_try)
            break  # Stop at the first format that parses every date
        except ValueError:
            continue  # Try the next format
    else:
        # No format matched; fail with a clear message rather than a cryptic
        # error later when the '.dt' accessor is used on a string column
        raise gr.Error("Unrecognised date format in the Medserv 'DOS' column")
    
    # Initialize an empty list to store unmatched rows
    unmatched_rows = []

    rows = len(mindbody)

    # Iterate through each row in the mindbody DataFrame
    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
        # Extract relevant information from the current row
        date = mindbody.iloc[idx]['DOS']
        first_name = mindbody.iloc[idx]['First Name']
        last_name = mindbody.iloc[idx]['Last Name']
        
        # Define the window of dates to search for a match in medserv
        # (cast tolerance to int in case the UI passes it as a float)
        date_range = [date - timedelta(days=i) for i in range(int(tolerance), -int(tolerance) - 1, -1)]
        # Remove the time component from the dates in date_range
        date_range = [d.date() for d in date_range]
        
        # Filter medserv by the date range and a case-insensitive,
        # whitespace-tolerant match on either the first or the last name
        matches = medserv[(medserv['DOS'].dt.date.isin(date_range)) &
                          ((medserv['First Name'].str.strip().str.lower() == str(first_name).strip().lower()) |
                           (medserv['Last Name'].str.strip().str.lower() == str(last_name).strip().lower()))]
        
        # If no match is found, append the row to the unmatched_rows list
        if matches.empty:
            unmatched_rows.append(mindbody.iloc[idx])

    # Create a DataFrame from the unmatched_rows list
    unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)

    # Specify the columns to include in the output Excel file
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']

    # Format the 'DOS' column to remove the time part (skipped when there are
    # no unmatched rows, since an empty object column has no '.dt' accessor)
    if not unmatched_df.empty:
        unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)

    return output_file_path




def load_data(files):
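    """Read one or more uploaded Excel files into a single DataFrame and
    normalise the detected date and name columns to 'DOS' and 'Client'."""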
    # Collect the path of every uploaded file
    filepaths = [file.name for file in files]
    
    # Load and concatenate multiple files if provided
    dfs = []
    for filepath in filepaths:
        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
            dfs.append(pd.read_excel(filepath))
        else:
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
    
    # Concatenate dataframes if more than one file is provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]
    
    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df


def find_name_column(df):
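    """Return the column whose values most often look like 'Last, First'
    style names, or None if no column matches."""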
    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)

    max_count = 0
    name_column = None

    for column in df.columns:
        # Count matches of the name pattern in each column
        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
        valid_count = matches.sum()  # Sum of True values indicating valid names

        # Select the column with the maximum count of valid names
        if valid_count > max_count:
            max_count = valid_count
            name_column = column

    return name_column


def find_date_column(df):
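    """Return 'Treatment dates' if that column exists, otherwise the column
    whose values most often contain a recognisable date (None if none do)."""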
    # Check if 'Treatment dates' column exists
    if 'Treatment dates' in df.columns:
        return 'Treatment dates'

    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats

    max_count = 0
    date_column = None

    for column in df.columns:
        # Count matches of the date pattern in each column
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()  # Sum of True values indicating valid dates

        # Select the column with the maximum count of valid dates
        if valid_count > max_count:
            max_count = valid_count
            date_column = column

    return date_column
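

# Minimal launch sketch (assumed wiring: the original file does not show how
# the Gradio interface is constructed, so the component choices, labels, and
# default tolerance below are illustrative only). It assumes a Gradio version
# in which gr.File uploads expose a '.name' path, as load_data expects.
if __name__ == '__main__':
    demo = gr.Interface(
        fn=process_data,
        inputs=[
            gr.File(file_count='multiple', label='MindBody report(s)'),
            gr.File(file_count='multiple', label='Medserv report(s)'),
            gr.Number(value=3, precision=0, label='Date tolerance (days)'),
        ],
        outputs=gr.File(label='Comparison results'),
        title='MindBody vs Medserv comparison',
    )
    demo.launch()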