Spaces:
Sleeping
Sleeping
Ronan
commited on
Commit
•
dd6a24d
1
Parent(s):
ec6dd69
feat: add new filters
Browse files- __pycache__/utils.cpython-310.pyc +0 -0
- country_by_country/utils/__pycache__/utils.cpython-310.pyc +0 -0
- extract_config.yaml +2 -1
- pages/0_Import_File.py +78 -46
- pages/1_Selected_Pages.py +2 -1
- pages/2_Metadata.py +14 -1
- pages/5_Clean_Tables.py +100 -12
- utils.py +24 -0
__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
country_by_country/utils/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/country_by_country/utils/__pycache__/utils.cpython-310.pyc and b/country_by_country/utils/__pycache__/utils.cpython-310.pyc differ
|
|
extract_config.yaml
CHANGED
@@ -4,7 +4,8 @@ pagefilter:
|
|
4 |
modelfile: random_forest_model_low_false_positive.joblib
|
5 |
|
6 |
table_extraction:
|
|
|
7 |
- type: Unstructured
|
8 |
params:
|
9 |
hi_res_model_name: "yolox"
|
10 |
-
pdf_image_dpi: 300
|
|
|
4 |
modelfile: random_forest_model_low_false_positive.joblib
|
5 |
|
6 |
table_extraction:
|
7 |
+
- type: LlamaParse
|
8 |
- type: Unstructured
|
9 |
params:
|
10 |
hi_res_model_name: "yolox"
|
11 |
+
pdf_image_dpi: 300
|
pages/0_Import_File.py
CHANGED
@@ -7,7 +7,7 @@ import yaml
|
|
7 |
import copy
|
8 |
from menu import display_pages_menu, display_config
|
9 |
from pypdf import PdfReader
|
10 |
-
from utils import get_pdf_iframe, set_state
|
11 |
|
12 |
from country_by_country.processor import ReportProcessor
|
13 |
|
@@ -18,6 +18,48 @@ def set_page_filter(value: dict):
|
|
18 |
set_state(["config", "pagefilter"], value)
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
|
22 |
st.title("Country by Country Tax Reporting analysis")
|
23 |
st.subheader(
|
@@ -27,6 +69,23 @@ display_pages_menu()
|
|
27 |
|
28 |
mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
with st.sidebar:
|
31 |
|
32 |
st.markdown("# PDF Upload")
|
@@ -34,13 +93,10 @@ with st.sidebar:
|
|
34 |
st.markdown("## PDF Report to process")
|
35 |
original_pdf = st.file_uploader(
|
36 |
"Upload a pdf document containing financial table : ",
|
|
|
|
|
37 |
)
|
38 |
|
39 |
-
if original_pdf is not None:
|
40 |
-
mytmpfile.write(original_pdf.read())
|
41 |
-
st.session_state["working_file_pdf"] = mytmpfile
|
42 |
-
st.session_state["original_pdf_name"] = original_pdf.name
|
43 |
-
|
44 |
if "original_pdf_name" in st.session_state:
|
45 |
st.markdown(
|
46 |
"Already loaded file : " + st.session_state["original_pdf_name"],
|
@@ -50,7 +106,10 @@ with st.sidebar:
|
|
50 |
# Upload personalized config if required
|
51 |
loaded_config = st.file_uploader(
|
52 |
"Upload a config if the default config doesn't suit you :",
|
|
|
|
|
53 |
)
|
|
|
54 |
if loaded_config is not None:
|
55 |
if not loaded_config.name.endswith(".yaml"):
|
56 |
st.error("Please upload a yaml file")
|
@@ -69,26 +128,28 @@ with st.sidebar:
|
|
69 |
loaded_config = None
|
70 |
|
71 |
# Extract config
|
72 |
-
with open("extract_config.yaml", "r") as f:
|
73 |
-
default_config = f.read()
|
74 |
-
|
75 |
-
if not st.session_state.get("config_is_set", False):
|
76 |
-
st.session_state["initial_config"] = yaml.safe_load(default_config)
|
77 |
-
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
|
78 |
-
st.session_state["config_is_set"] = True
|
79 |
|
80 |
if bool(loaded_config):
|
81 |
st.session_state["initial_config"] = loaded_config_dict
|
82 |
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
|
83 |
-
st.session_state["config_is_set"] = True
|
84 |
|
85 |
# Set page filter
|
86 |
-
|
87 |
pagefilter["type"]: pagefilter
|
88 |
for pagefilter in st.session_state["initial_config"]["pagefilter"]
|
89 |
}
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
display_config()
|
94 |
|
@@ -103,32 +164,3 @@ if "working_file_pdf" in st.session_state:
|
|
103 |
get_pdf_iframe(st.session_state["working_file_pdf"].name),
|
104 |
unsafe_allow_html=True,
|
105 |
)
|
106 |
-
|
107 |
-
if "first_time" not in st.session_state:
|
108 |
-
st.session_state["first_time"] = False
|
109 |
-
logging.info("Loading config and pdf")
|
110 |
-
st.session_state["proc"] = ReportProcessor(st.session_state["config"])
|
111 |
-
|
112 |
-
logging.info("Config and pdf loaded")
|
113 |
-
|
114 |
-
assets = {
|
115 |
-
"pagefilter": {},
|
116 |
-
"table_extractors": [],
|
117 |
-
}
|
118 |
-
|
119 |
-
# Filtering the pages
|
120 |
-
st.session_state["proc"].page_filter(
|
121 |
-
st.session_state["working_file_pdf"].name,
|
122 |
-
assets,
|
123 |
-
)
|
124 |
-
|
125 |
-
logging.info(f"Assets : {assets}")
|
126 |
-
|
127 |
-
if len(assets["pagefilter"]["selected_pages"]) == 0:
|
128 |
-
# No page has been automatically selected by the page filter
|
129 |
-
# Hence, we display the full pdf, letting the user select the pages
|
130 |
-
pdfreader = PdfReader(st.session_state["working_file_pdf"])
|
131 |
-
number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
|
132 |
-
assets["pagefilter"]["selected_pages"] = list(range(number_pages))
|
133 |
-
st.session_state["assets"] = assets
|
134 |
-
st.switch_page("pages/1_Selected_Pages.py")
|
|
|
7 |
import copy
|
8 |
from menu import display_pages_menu, display_config
|
9 |
from pypdf import PdfReader
|
10 |
+
from utils import get_pdf_iframe, set_state, generate_assets
|
11 |
|
12 |
from country_by_country.processor import ReportProcessor
|
13 |
|
|
|
18 |
set_state(["config", "pagefilter"], value)
|
19 |
|
20 |
|
21 |
+
def initiate_configuration() -> None:
|
22 |
+
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
|
23 |
+
if isinstance(st.session_state["config"]["pagefilter"], list):
|
24 |
+
st.session_state["config"]["pagefilter"] = st.session_state["initial_config"][
|
25 |
+
"pagefilter"
|
26 |
+
][0]
|
27 |
+
st.session_state["selected_page_filter_name"] = st.session_state["config"][
|
28 |
+
"pagefilter"
|
29 |
+
]["type"]
|
30 |
+
|
31 |
+
|
32 |
+
def on_pdf_file_upload() -> None:
|
33 |
+
# Change states related to the pdf file upload
|
34 |
+
mytmpfile.write(st.session_state.original_pdf.read())
|
35 |
+
st.session_state["working_file_pdf"] = mytmpfile
|
36 |
+
st.session_state["original_pdf_name"] = st.session_state.original_pdf.name
|
37 |
+
|
38 |
+
# Generate assets
|
39 |
+
generate_assets()
|
40 |
+
|
41 |
+
st.session_state["page_redirection"] = "pages/1_Selected_Pages.py"
|
42 |
+
|
43 |
+
|
44 |
+
def on_config_file_upload() -> None:
|
45 |
+
st.session_state["initial_config"] = st.session_state["initial_uploaded_config"]
|
46 |
+
initiate_configuration()
|
47 |
+
|
48 |
+
|
49 |
+
def on_change_page_filter(name_to_filter_dict: dict) -> None:
|
50 |
+
st.session_state["selected_page_filter_name"] = st.session_state[
|
51 |
+
"radio_button_filter_selection"
|
52 |
+
] # this 'buffer' is needed because selectors wipe their key on reload
|
53 |
+
set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]])
|
54 |
+
|
55 |
+
|
56 |
+
# Check if a redirection was requested
|
57 |
+
# Workaround because st.switch_page is not allowed in a callback function
|
58 |
+
if st.session_state.get("page_redirection", False):
|
59 |
+
page_to_redirect_to = st.session_state["page_redirection"]
|
60 |
+
st.session_state["page_redirection"] = False
|
61 |
+
st.switch_page(page_to_redirect_to)
|
62 |
+
|
63 |
st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
|
64 |
st.title("Country by Country Tax Reporting analysis")
|
65 |
st.subheader(
|
|
|
69 |
|
70 |
mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
71 |
|
72 |
+
# State initialization
|
73 |
+
if "first_time" not in st.session_state:
|
74 |
+
logging.info("State initialization...")
|
75 |
+
st.session_state["first_time"] = False
|
76 |
+
|
77 |
+
logging.info("... loading default extract config")
|
78 |
+
with open("extract_config.yaml", "r") as f:
|
79 |
+
st.session_state["initial_config"] = yaml.safe_load(f.read())
|
80 |
+
initiate_configuration()
|
81 |
+
|
82 |
+
logging.info("... initializing processor and assets")
|
83 |
+
st.session_state["proc"] = ReportProcessor(st.session_state["config"])
|
84 |
+
st.session_state["assets"] = {
|
85 |
+
"pagefilter": {},
|
86 |
+
"table_extractors": [],
|
87 |
+
}
|
88 |
+
|
89 |
with st.sidebar:
|
90 |
|
91 |
st.markdown("# PDF Upload")
|
|
|
93 |
st.markdown("## PDF Report to process")
|
94 |
original_pdf = st.file_uploader(
|
95 |
"Upload a pdf document containing financial table : ",
|
96 |
+
key="original_pdf",
|
97 |
+
on_change=on_pdf_file_upload,
|
98 |
)
|
99 |
|
|
|
|
|
|
|
|
|
|
|
100 |
if "original_pdf_name" in st.session_state:
|
101 |
st.markdown(
|
102 |
"Already loaded file : " + st.session_state["original_pdf_name"],
|
|
|
106 |
# Upload personalized config if required
|
107 |
loaded_config = st.file_uploader(
|
108 |
"Upload a config if the default config doesn't suit you :",
|
109 |
+
key="initial_uploaded_config",
|
110 |
+
on_change=initiate_configuration,
|
111 |
)
|
112 |
+
|
113 |
if loaded_config is not None:
|
114 |
if not loaded_config.name.endswith(".yaml"):
|
115 |
st.error("Please upload a yaml file")
|
|
|
128 |
loaded_config = None
|
129 |
|
130 |
# Extract config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
if bool(loaded_config):
|
133 |
st.session_state["initial_config"] = loaded_config_dict
|
134 |
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
|
|
|
135 |
|
136 |
# Set page filter
|
137 |
+
page_filter_name_to_config_mapping = {
|
138 |
pagefilter["type"]: pagefilter
|
139 |
for pagefilter in st.session_state["initial_config"]["pagefilter"]
|
140 |
}
|
141 |
+
page_filter_list = list(page_filter_name_to_config_mapping.keys())
|
142 |
+
current_selected_page_filter_index = page_filter_list.index(
|
143 |
+
st.session_state["selected_page_filter_name"]
|
144 |
+
)
|
145 |
+
selected_page_filter_name = st.radio(
|
146 |
+
"Page filter",
|
147 |
+
page_filter_list,
|
148 |
+
index=current_selected_page_filter_index,
|
149 |
+
on_change=on_change_page_filter,
|
150 |
+
key="radio_button_filter_selection",
|
151 |
+
args=(page_filter_name_to_config_mapping,),
|
152 |
+
)
|
153 |
|
154 |
display_config()
|
155 |
|
|
|
164 |
get_pdf_iframe(st.session_state["working_file_pdf"].name),
|
165 |
unsafe_allow_html=True,
|
166 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/1_Selected_Pages.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
from country_by_country.processor import ReportProcessor
|
3 |
-
from utils import get_pdf_iframe, set_state
|
4 |
from country_by_country.utils.utils import keep_pages
|
5 |
from pypdf import PdfReader
|
6 |
from menu import display_pages_menu, display_config
|
@@ -29,6 +29,7 @@ def set_extractors() -> None:
|
|
29 |
]
|
30 |
set_state(["config", "table_extraction"], selected_extractors_dict)
|
31 |
st.session_state["proc"] = ReportProcessor(st.session_state["config"])
|
|
|
32 |
|
33 |
|
34 |
st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈"
|
|
|
1 |
import streamlit as st
|
2 |
from country_by_country.processor import ReportProcessor
|
3 |
+
from utils import get_pdf_iframe, set_state, generate_assets
|
4 |
from country_by_country.utils.utils import keep_pages
|
5 |
from pypdf import PdfReader
|
6 |
from menu import display_pages_menu, display_config
|
|
|
29 |
]
|
30 |
set_state(["config", "table_extraction"], selected_extractors_dict)
|
31 |
st.session_state["proc"] = ReportProcessor(st.session_state["config"])
|
32 |
+
generate_assets()
|
33 |
|
34 |
|
35 |
st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈"
|
pages/2_Metadata.py
CHANGED
@@ -41,6 +41,7 @@ if "pdf_after_page_validation" in st.session_state:
|
|
41 |
currency = st.session_state["metadata"]["currency"]
|
42 |
unit = st.session_state["metadata"]["unit"]
|
43 |
headquarter = st.session_state["metadata"]["headquarter"]
|
|
|
44 |
else:
|
45 |
company_name = None
|
46 |
sector = None
|
@@ -48,6 +49,15 @@ if "pdf_after_page_validation" in st.session_state:
|
|
48 |
currency = None
|
49 |
unit = None
|
50 |
headquarter = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
companies = list(COMPANIES.keys())
|
52 |
company_name = st.selectbox(
|
53 |
"Company name",
|
@@ -73,7 +83,9 @@ if "pdf_after_page_validation" in st.session_state:
|
|
73 |
currency = st.selectbox(
|
74 |
"Currency",
|
75 |
currencies,
|
76 |
-
index=currencies.index(currency)
|
|
|
|
|
77 |
)
|
78 |
|
79 |
units = [
|
@@ -98,6 +110,7 @@ if "pdf_after_page_validation" in st.session_state:
|
|
98 |
)
|
99 |
if submitted:
|
100 |
st.session_state["metadata"] = {
|
|
|
101 |
"company_name": company_name,
|
102 |
"sector": sector,
|
103 |
"year": year,
|
|
|
41 |
currency = st.session_state["metadata"]["currency"]
|
42 |
unit = st.session_state["metadata"]["unit"]
|
43 |
headquarter = st.session_state["metadata"]["headquarter"]
|
44 |
+
decimal_separator = st.session_state["metadata"]["separator"]
|
45 |
else:
|
46 |
company_name = None
|
47 |
sector = None
|
|
|
49 |
currency = None
|
50 |
unit = None
|
51 |
headquarter = ""
|
52 |
+
decimal_separator = ","
|
53 |
+
|
54 |
+
separator_list = [",", "."]
|
55 |
+
decimal_separator = st.selectbox(
|
56 |
+
"Decimal separator",
|
57 |
+
separator_list,
|
58 |
+
index=separator_list.index(decimal_separator),
|
59 |
+
)
|
60 |
+
|
61 |
companies = list(COMPANIES.keys())
|
62 |
company_name = st.selectbox(
|
63 |
"Company name",
|
|
|
83 |
currency = st.selectbox(
|
84 |
"Currency",
|
85 |
currencies,
|
86 |
+
index=currencies.index(currency)
|
87 |
+
if currency
|
88 |
+
else currencies.index("EUR - Euro"),
|
89 |
)
|
90 |
|
91 |
units = [
|
|
|
110 |
)
|
111 |
if submitted:
|
112 |
st.session_state["metadata"] = {
|
113 |
+
"separator": decimal_separator,
|
114 |
"company_name": company_name,
|
115 |
"sector": sector,
|
116 |
"year": year,
|
pages/5_Clean_Tables.py
CHANGED
@@ -50,12 +50,12 @@ def convert_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
|
|
50 |
for column_name in dataframe.columns:
|
51 |
try:
|
52 |
dataframe[column_name] = dataframe[column_name].astype(float)
|
53 |
-
except Exception:
|
54 |
pass
|
55 |
return dataframe
|
56 |
|
57 |
|
58 |
-
special_characters = "#&()[]
|
59 |
|
60 |
|
61 |
def style_symbol(v, props=""):
|
@@ -181,25 +181,52 @@ if (
|
|
181 |
height=900,
|
182 |
)
|
183 |
|
|
|
|
|
|
|
|
|
184 |
col7, col8, col9 = st.columns([1, 1, 1])
|
185 |
with col7:
|
186 |
total = st.checkbox(
|
187 |
"Calculate the Total of each columns, excluding the last row", value=True
|
188 |
)
|
189 |
country = st.checkbox("Activate the country filter", value=True)
|
|
|
190 |
|
191 |
with col8:
|
192 |
negativ = st.checkbox(
|
193 |
"Show the negative numbers, for each columns detected as a numerical type"
|
194 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
with col9:
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy()
|
|
|
203 |
|
204 |
if country:
|
205 |
dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply(
|
@@ -207,13 +234,74 @@ if (
|
|
207 |
)
|
208 |
|
209 |
if remove_symbols:
|
210 |
-
pattern = "
|
211 |
-
for column in dataframe.
|
212 |
-
|
213 |
-
|
214 |
-
|
|
|
215 |
dataframe = convert_dataframe(dataframe)
|
216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
if total:
|
218 |
dataframe = convert_dataframe(dataframe)
|
219 |
new_row = dataframe.apply(column_sum, axis=0)
|
|
|
50 |
for column_name in dataframe.columns:
|
51 |
try:
|
52 |
dataframe[column_name] = dataframe[column_name].astype(float)
|
53 |
+
except Exception as e:
|
54 |
pass
|
55 |
return dataframe
|
56 |
|
57 |
|
58 |
+
special_characters = "#&()[]@©€$'R¹³²"
|
59 |
|
60 |
|
61 |
def style_symbol(v, props=""):
|
|
|
181 |
height=900,
|
182 |
)
|
183 |
|
184 |
+
st.subheader(
|
185 |
+
"Filters : ",
|
186 |
+
)
|
187 |
+
|
188 |
col7, col8, col9 = st.columns([1, 1, 1])
|
189 |
with col7:
|
190 |
total = st.checkbox(
|
191 |
"Calculate the Total of each columns, excluding the last row", value=True
|
192 |
)
|
193 |
country = st.checkbox("Activate the country filter", value=True)
|
194 |
+
decimal_cleanup = st.checkbox("Apply decimal cleanup")
|
195 |
|
196 |
with col8:
|
197 |
negativ = st.checkbox(
|
198 |
"Show the negative numbers, for each columns detected as a numerical type"
|
199 |
)
|
200 |
+
|
201 |
+
with st.container(border=True):
|
202 |
+
cleanup_rules = st.checkbox(
|
203 |
+
"Apply clean up rules : (number) mean a negative number, o-> 0, homogenization NA, ect ect "
|
204 |
+
)
|
205 |
+
if cleanup_rules:
|
206 |
+
cleanup_excluded = st.multiselect(
|
207 |
+
"exclude from filtering",
|
208 |
+
st.session_state.tables[st.session_state["algorithm_name"]].columns,
|
209 |
+
key="cleanup",
|
210 |
+
)
|
211 |
+
|
212 |
with col9:
|
213 |
+
with st.container(border=True):
|
214 |
+
symbol = st.checkbox(
|
215 |
+
"Show the cells that contain a special symbol : " + special_characters,
|
216 |
+
value=True,
|
217 |
+
)
|
218 |
+
remove_symbols = st.checkbox(
|
219 |
+
"Remove the special symbols on numeric columns"
|
220 |
+
)
|
221 |
+
if remove_symbols:
|
222 |
+
rm_symbol_excluded = st.multiselect(
|
223 |
+
"exclude from filtering",
|
224 |
+
st.session_state.tables[st.session_state["algorithm_name"]].columns,
|
225 |
+
key="rm_symbol",
|
226 |
+
)
|
227 |
|
228 |
dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy()
|
229 |
+
dataframe = convert_dataframe(dataframe)
|
230 |
|
231 |
if country:
|
232 |
dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply(
|
|
|
234 |
)
|
235 |
|
236 |
if remove_symbols:
|
237 |
+
pattern = "[" + re.escape(special_characters) + "]"
|
238 |
+
for column, dtype in dataframe.dtypes.items():
|
239 |
+
if column not in rm_symbol_excluded:
|
240 |
+
dataframe[column] = dataframe[column].apply(
|
241 |
+
lambda x: re.sub(pattern, "", str(x))
|
242 |
+
)
|
243 |
dataframe = convert_dataframe(dataframe)
|
244 |
|
245 |
+
if cleanup_rules:
|
246 |
+
for column, dtype in dataframe.dtypes.items():
|
247 |
+
if column not in cleanup_excluded:
|
248 |
+
# this is a code translated by chatgpt from Kane's R code
|
249 |
+
dataframe[column] = dataframe[column].replace(
|
250 |
+
{"^-$|^$|^ $|^N/I$|^- -$|^N/A$|^n\\.a\\.$": None}, regex=True
|
251 |
+
)
|
252 |
+
dataframe[column] = dataframe[column].replace(
|
253 |
+
{"^o$|^O$|^\\(o\\)$|^\\(O\\)$|^\\(0\\)$": "0"}, regex=True
|
254 |
+
)
|
255 |
+
|
256 |
+
if dtype == object:
|
257 |
+
dataframe[column] = dataframe[column].str.replace(
|
258 |
+
"(\\(.*\\))[:alnum:]+", "\\1", regex=True
|
259 |
+
)
|
260 |
+
dataframe[column] = dataframe[column].str.replace(
|
261 |
+
"\\([:alnum:]+$|\\)[:alnum:]+$", "", regex=True
|
262 |
+
)
|
263 |
+
dataframe[column] = dataframe[column].str.replace(
|
264 |
+
"\\([:alpha:]+\\)", "", regex=True
|
265 |
+
)
|
266 |
+
dataframe[column] = dataframe[column].str.replace(
|
267 |
+
"(.+)\\(.+\\)$", "\\1", regex=True
|
268 |
+
)
|
269 |
+
dataframe[column] = dataframe[column].str.replace(
|
270 |
+
"^\\(-(.*)\\)", "-\\1", regex=True
|
271 |
+
)
|
272 |
+
dataframe[column] = dataframe[column].str.replace(
|
273 |
+
"^\\((.*)\\)", "-\\1", regex=True
|
274 |
+
)
|
275 |
+
dataframe[column] = dataframe[column].str.replace(
|
276 |
+
"\\(.*\\)| |\\*|^-$|\\[.*\\]|^-€$", "", regex=True
|
277 |
+
)
|
278 |
+
dataframe = convert_dataframe(dataframe)
|
279 |
+
if decimal_cleanup:
|
280 |
+
decimal_separator = (
|
281 |
+
st.session_state["metadata"]["separator"]
|
282 |
+
if st.session_state["metadata"]["separator"]
|
283 |
+
else ","
|
284 |
+
)
|
285 |
+
for column, dtype in dataframe.dtypes.items():
|
286 |
+
if dtype == object:
|
287 |
+
if decimal_separator == ",":
|
288 |
+
dataframe[column] = dataframe[column].str.replace(
|
289 |
+
"\\.", "", regex=False
|
290 |
+
)
|
291 |
+
dataframe[column] = dataframe[column].str.replace(
|
292 |
+
",", ".", regex=False
|
293 |
+
)
|
294 |
+
else:
|
295 |
+
dataframe[column] = dataframe[column].str.replace(
|
296 |
+
",(.{1,2})$", ".\\1", regex=True
|
297 |
+
)
|
298 |
+
dataframe[column] = dataframe[column].str.replace(
|
299 |
+
"\\.([0-9]{3})", ",\\1", regex=True
|
300 |
+
)
|
301 |
+
dataframe[column] = dataframe[column].str.replace(
|
302 |
+
",", "", regex=False
|
303 |
+
)
|
304 |
+
|
305 |
if total:
|
306 |
dataframe = convert_dataframe(dataframe)
|
307 |
new_row = dataframe.apply(column_sum, axis=0)
|
utils.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
import base64
|
|
|
2 |
from pathlib import Path
|
3 |
from typing import Any
|
4 |
|
5 |
import pandas as pd
|
6 |
import streamlit as st
|
|
|
7 |
|
8 |
|
9 |
def get_pdf_iframe(pdf_to_process: str) -> str:
|
@@ -61,3 +63,25 @@ def set_state(key: Any, value: Any) -> None:
|
|
61 |
nested_value[key_list[-1]] = value
|
62 |
else:
|
63 |
st.session_state[key] = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import base64
|
2 |
+
import logging
|
3 |
from pathlib import Path
|
4 |
from typing import Any
|
5 |
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
+
from pypdf import PdfReader
|
9 |
|
10 |
|
11 |
def get_pdf_iframe(pdf_to_process: str) -> str:
|
|
|
63 |
nested_value[key_list[-1]] = value
|
64 |
else:
|
65 |
st.session_state[key] = value
|
66 |
+
|
67 |
+
|
68 |
+
def generate_assets() -> None:
|
69 |
+
assets = {
|
70 |
+
"pagefilter": {},
|
71 |
+
"table_extractors": [],
|
72 |
+
}
|
73 |
+
|
74 |
+
# Filtering the pages
|
75 |
+
st.session_state["proc"].page_filter(
|
76 |
+
st.session_state["working_file_pdf"].name,
|
77 |
+
assets,
|
78 |
+
)
|
79 |
+
|
80 |
+
logging.info(f"Assets : {assets}")
|
81 |
+
|
82 |
+
if len(assets["pagefilter"]["selected_pages"]) == 0:
|
83 |
+
# No page has been automatically selected by the page filter
|
84 |
+
# Hence, we display the full pdf, letting the user select the pages
|
85 |
+
number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
|
86 |
+
assets["pagefilter"]["selected_pages"] = list(range(number_pages))
|
87 |
+
st.session_state["assets"] = assets
|