import pandas as pd
import gradio as gr
import re
from datetime import timedelta


def process_data(files_mindbody, files_medserv, tolerance, progress=gr.Progress()):

    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    # Split 'Client' names ("Last, First") into last/first name columns for both DataFrames;
    # n=1 guards against extra commas and strip() removes stray whitespace around the parts
    for df in (medserv, mindbody):
        df[['Last Name', 'First Name']] = df['Client'].str.split(',', n=1, expand=True)
        df['First Name'] = df['First Name'].str.strip()
        df['Last Name'] = df['Last Name'].str.strip()
    # Initialize an empty list to store unmatched rows
    unmatched_rows = []

    rows = len(mindbody)

    # Iterate through each row in the mindbody DataFrame
    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
        # Extract relevant information from the current row
        date = mindbody.iloc[idx]['DOS']
        first_name = mindbody.iloc[idx]['First Name']
        last_name = mindbody.iloc[idx]['Last Name']
        
        # Build the window of acceptable dates: every day within +/- tolerance of the DOS
        date_range = [date + timedelta(days=offset)
                      for offset in range(-int(tolerance), int(tolerance) + 1)]
        
        # Filter medserv based on the date range and name criteria
        matches = medserv[((medserv['DOS'].isin(date_range)) & 
                        ((medserv['First Name'] == first_name) | 
                            (medserv['Last Name'] == last_name)))]
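        # e.g. (hypothetical values): with tolerance=1 and a mindbody DOS of 2023-04-15,
        # a medserv row counts as a match if its DOS falls on 2023-04-14..2023-04-16 and
        # either its first or its last name agrees with the mindbody row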
        
        # If no match is found, append the row to the unmatched_rows list
        if matches.empty:
            unmatched_rows.append(mindbody.iloc[idx])

    # Create a DataFrame from the unmatched_rows list
    unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)

    # Specify the columns to include in the output Excel file
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']

    # Format the 'DOS' column as dd-mm-yyyy (drops the time part); coerce to datetime first
    # so this also works if the rebuilt column came out with object dtype
    if not unmatched_df.empty:
        unmatched_df['DOS'] = pd.to_datetime(unmatched_df['DOS']).dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)

    return output_file_path



def load_data(files):
    # Gradio passes a list of uploaded file objects (or plain path strings in newer
    # versions); collect the underlying filesystem paths either way
    filepaths = [getattr(file, 'name', file) for file in files]
    
    # Load and concatenate multiple files if provided
    dfs = []
    for filepath in filepaths:
        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
            dfs.append(pd.read_excel(filepath))
        else:
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
    
    # Concatenate dataframes if more than one file is provided
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]
    
    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df.rename(columns={date_column: 'DOS'}, inplace=True)
    else:
        raise gr.Error("Could not detect a date column in one of the uploaded files")

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df.rename(columns={name_column: 'Client'}, inplace=True)
    else:
        raise gr.Error("Could not detect a client name column in one of the uploaded files")

    return df
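
# Hypothetical direct use (outside Gradio): load_data(files) accepts any sequence whose
# elements are either objects exposing a .name path or plain path strings to .xls/.xlsx
# exports, and returns one DataFrame with the detected 'DOS' and 'Client' columns.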


def find_name_column(df):
    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)
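    # Strings this pattern is meant to accept/reject (illustrative examples only):
    #   "Doe, Jane"          -> match
    #   "O'Brien, Mary-Anne" -> match
    #   "Jane Doe"           -> no match (no comma between last and first name)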

    max_count = 0
    name_column = None

    for column in df.columns:
        # Count matches of the name pattern in each column
        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
        valid_count = matches.sum()  # Sum of True values indicating valid names

        # Select the column with the maximum count of valid names
        if valid_count > max_count:
            max_count = valid_count
            name_column = column

    return name_column


def find_date_column(df):
    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats
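    # Illustrative strings (examples only): "2023-04-15", "15/04/2023", and the string form
    # of Excel datetimes ("2023-04-15 00:00:00") all contain a match; "April 15, 2023" does not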

    max_count = 0
    date_column = None

    for column in df.columns:
        # Count matches of the date pattern in each column
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()  # Sum of True values indicating valid dates

        # Select the column with the maximum count of valid dates
        if valid_count > max_count:
            max_count = valid_count
            date_column = column

    return date_column
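

# Example wiring (sketch). The UI definition is not part of this file; the block below
# is a hypothetical, minimal sketch of how process_data might be exposed through Gradio,
# assuming two multi-file upload inputs, a numeric tolerance in days, and the generated
# Excel file as the downloadable output. Labels and defaults are illustrative only.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=process_data,
        inputs=[
            gr.Files(label="Mindbody export(s) (.xls/.xlsx)"),
            gr.Files(label="Medserv export(s) (.xls/.xlsx)"),
            gr.Number(label="Date tolerance (days)", value=1, precision=0),
        ],
        outputs=gr.File(label="Comparison Results"),
        title="Mindbody vs. Medserv reconciliation",
    )
    demo.launch()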