Spaces: Sleeping
Ronan committed
Commit ec6dd69 • 1 Parent(s): 5a0bc6c
feat: first commit
feat: add license
feat: add requirements.txt
fix: rm poetry dependency
fix: opencv version
fix requirements
add pillow-heif = "^0.15.0"
fix: scikit-learn
add packages.txt
rm camelot config
fix: dependency issues
fix altair version
add tesseract packages
rm tesseract-ocr-dev
fix: come back to aggrid 0.3.4
feat: update
comment cleaning part
rm dependency cleaning
add models
update Hugging Face
fix: use app.py
update
UPDATE
fix: don't use app/ path
fix: path
add Extractable
fix: no ExtractTable
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitignore +1 -0
- .streamlit/config.toml +2 -0
- LICENSE +20 -0
- README.md +3 -6
- __pycache__/menu.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +3 -0
- configs/test_full_workflow.yaml +16 -0
- configs/v0.yaml +20 -0
- country_by_country/.empty +0 -0
- country_by_country/__init__.py +21 -0
- country_by_country/__main__.py +67 -0
- country_by_country/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/__pycache__/__main__.cpython-310.pyc +0 -0
- country_by_country/__pycache__/dash_demo.cpython-310.pyc +0 -0
- country_by_country/__pycache__/dash_process_methods.cpython-310.pyc +0 -0
- country_by_country/__pycache__/processor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
- country_by_country/models/decision_tree_model.joblib +0 -0
- country_by_country/models/random_forest_country_names.pkl +0 -0
- country_by_country/models/random_forest_keywords.pkl +0 -0
- country_by_country/models/random_forest_model_high_false_positive.joblib +0 -0
- country_by_country/models/random_forest_model_low_false_positive.joblib +0 -0
- country_by_country/pagefilter/__init__.py +41 -0
- country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc +0 -0
- country_by_country/pagefilter/copy_as_is.py +51 -0
- country_by_country/pagefilter/from_filename.py +79 -0
- country_by_country/pagefilter/rf_classifier.py +153 -0
- country_by_country/processor.py +87 -0
- country_by_country/table_cleaning/__init__.py +34 -0
- country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc +0 -0
- country_by_country/table_cleaning/llm_cleaner.py +183 -0
- country_by_country/table_extraction/__init__.py +61 -0
- country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc +0 -0
- country_by_country/table_extraction/camelot_extractor.py +57 -0
- country_by_country/table_extraction/extract_table_api.py +63 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+venv*

.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
+[client]
+showSidebarNavigation = false

LICENSE
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2015-2024 Data4Good
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md
CHANGED
@@ -1,12 +1,9 @@
 ---
-title:
-
-
-colorTo: blue
+title: TaxObservatory Demo
+colorFrom: red
+colorTo: green
 sdk: streamlit
 sdk_version: 1.32.2
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/menu.cpython-310.pyc
ADDED
Binary file (1.37 kB).

__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.03 kB).

app.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.switch_page("pages/0_Import_File.py")

configs/test_full_workflow.yaml
ADDED
@@ -0,0 +1,16 @@
+# Full workflow
+# Requires OpenAI API key and only works with table_extraction:Unstructured and r
+
+pagefilter:
+  type: FromFilename
+
+table_extraction:
+  - type: Unstructured
+    params:
+      pdf_image_dpi: 300
+      hi_res_model_name: "yolox"
+
+table_cleaning:
+  - type: LLM
+    params:
+      openai_model: "gpt-4-turbo-preview"

configs/v0.yaml
ADDED
@@ -0,0 +1,20 @@
+pagefilter:
+  type: RFClassifier
+  params:
+    modelfile: random_forest_model_low_false_positive.joblib
+
+table_extraction:
+  - type: Camelot
+    params:
+      flavor: stream
+  - type: Camelot
+    params:
+      flavor: lattice
+  - type: Unstructured
+    params:
+      hi_res_model_name: "yolox"
+      pdf_image_dpi: 300
+  # - type: LLamaParse
+  # - type: UnstructuredAPI
+
+# table_cleaning:

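Both configs above are consumed as plain dictionaries by the from_config factories shown further down in this diff. A minimal sketch of loading one and inspecting the stage entries (the config path is the one added in this commit; any file with the same schema works):

# Sketch: load a pipeline config and inspect how it is split across stages.
import yaml

with open("configs/v0.yaml") as fh:
    config = yaml.safe_load(fh)

# "pagefilter" is a single {"type": ..., "params": ...} mapping,
# while "table_extraction" is a list of such mappings.
print(config["pagefilter"]["type"])                     # RFClassifier
print([c["type"] for c in config["table_extraction"]])  # ['Camelot', 'Camelot', 'Unstructured']
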
country_by_country/.empty
ADDED
File without changes
country_by_country/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.

country_by_country/__main__.py
ADDED
@@ -0,0 +1,67 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+import logging
+import pickle
+import sys
+from pathlib import Path
+
+import yaml
+
+# Local imports
+from dotenv import load_dotenv
+
+from country_by_country import processor
+
+NUM_CLI_ARGS = 3
+
+
+def process_report(config: dict, pdf_filepath: str) -> None:
+    # Loading API keys from .env file
+    load_dotenv()
+
+    proc = processor.ReportProcessor(config)
+    return proc.process(pdf_filepath)
+
+
+if __name__ == "__main__":
+
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
+
+    if len(sys.argv) != NUM_CLI_ARGS:
+        logging.error("Usage : python -m country_by_country config.yaml report.pdf")
+        sys.exit(-1)
+
+    logging.info(f"\nLoading {sys.argv[1]}")
+    with Path(sys.argv[1]).open() as fh:
+        config = yaml.safe_load(fh)
+
+    assets = process_report(config, sys.argv[2])
+
+    # Save all the assets to disk
+    with Path("assets.pkl").open("wb") as fh:
+        pickle.dump(assets, fh)
+    logging.info(
+        "Assets dumped in assets.pkl. You can read then using : \n"
+        + "pickle.load(open('assets.pkl', 'rb'))",
+    )

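As the log message above suggests, a run dumps its results to assets.pkl; a minimal sketch of reloading them afterwards (assuming a previous run completed in the current directory):

# Sketch: reload the assets dumped by `python -m country_by_country config.yaml report.pdf`.
import pickle
from pathlib import Path

with Path("assets.pkl").open("rb") as fh:
    assets = pickle.load(fh)

# Layout mirrors ReportProcessor.process() further down in this diff.
print(assets["pagefilter"]["selected_pages"])
print(len(assets["table_extractors"]))
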
country_by_country/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).

country_by_country/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (977 Bytes).

country_by_country/__pycache__/dash_demo.cpython-310.pyc
ADDED
Binary file (10.5 kB).

country_by_country/__pycache__/dash_process_methods.cpython-310.pyc
ADDED
Binary file (6.62 kB).

country_by_country/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (1.38 kB).

country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (699 Bytes).

country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc
ADDED
Binary file (1.3 kB).

country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc
ADDED
Binary file (1.65 kB).

country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc
ADDED
Binary file (1.84 kB).

country_by_country/models/decision_tree_model.joblib
ADDED
Binary file (5.1 kB).

country_by_country/models/random_forest_country_names.pkl
ADDED
Binary file (10.5 kB).

country_by_country/models/random_forest_keywords.pkl
ADDED
Binary file (328 Bytes).

country_by_country/models/random_forest_model_high_false_positive.joblib
ADDED
Binary file (21.1 kB).

country_by_country/models/random_forest_model_low_false_positive.joblib
ADDED
Binary file (106 kB).

country_by_country/pagefilter/__init__.py
ADDED
@@ -0,0 +1,41 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+
+# Local imports
+from .copy_as_is import CopyAsIs
+from .from_filename import FromFilename
+from .rf_classifier import RFClassifier
+
+
+def from_config(config: dict) -> CopyAsIs | FromFilename:
+    filter_type = config["type"]
+    if "params" in config:
+        params = config["params"]
+
+    if filter_type == "CopyAsIs":
+        return CopyAsIs()
+    elif filter_type == "FromFilename":
+        return FromFilename()
+    elif filter_type == "RFClassifier":
+        return RFClassifier(**params)

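A minimal sketch of this factory in use, with the RFClassifier entry from configs/v0.yaml (the PDF path is illustrative; the model file ships under country_by_country/models):

# Sketch: build a page filter from its config mapping and run it on a report.
from country_by_country import pagefilter

page_filter = pagefilter.from_config(
    {
        "type": "RFClassifier",
        "params": {"modelfile": "random_forest_model_low_false_positive.joblib"},
    },
)

assets = {}
page_filter("report.pdf", assets)  # illustrative path
print(assets["pagefilter"]["selected_pages"])
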
country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (630 Bytes).

country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc
ADDED
Binary file (1.13 kB).

country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc
ADDED
Binary file (777 Bytes).

country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc
ADDED
Binary file (1.83 kB).

country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc
ADDED
Binary file (5.05 kB).

country_by_country/pagefilter/copy_as_is.py
ADDED
@@ -0,0 +1,51 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# External imports
+import pypdf
+
+
+class CopyAsIs:
+    """
+    Dummy filter just copying the source pdf to a target
+    temporary file
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Basically keeps all the pages of the original document
+        Writes assets:
+            src_pdf: the original pdf filepath
+            selected_pages : list of selected pages
+        """
+
+        reader = pypdf.PdfReader(pdf_filepath)
+        n_pages = len(reader.pages)
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": list(range(n_pages)),
+            }

country_by_country/pagefilter/from_filename.py
ADDED
@@ -0,0 +1,79 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+from pathlib import Path
+
+NUM_PAGE_FIELDS = 2
+
+
+class FromFilename:
+    """
+    Filtering from filename. This filter expects the filename
+    of the pdf contains either the page or a page range of interest
+    explicitely given in the filename as :
+
+    /dir/containing/the/filename_of_the_report_#1.pdf
+    /dif/containing/the/filename_of_the_report_#1-#2.pdf
+
+    where #1 is a single page
+          #1-#2 is a page range
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Reads and processes a pdf from its filepath
+        It writes the filtered pdf as a temporary pdf
+        The filepath of this temporary pdf is returned
+
+        Writes assets:
+            src_pdf: the original pdf filepath
+            target_pdf: the temporary target pdf filepath
+            selected_pages : list of selected pages
+        """
+
+        # Get the page or page range from the filename
+        src_filename = Path(pdf_filepath).name
+
+        # We remove the extension, split on "_" and keep the last field
+        pagefield = src_filename[:-4].split("_")[-1]
+        selected_pages = []
+
+        if pagefield.isnumeric():
+            selected_pages = [int(pagefield) - 1]
+        else:
+            pagefields = pagefield.split("-")
+            if (
+                len(pagefields) == NUM_PAGE_FIELDS
+                and pagefields[0].isnumeric()
+                and pagefields[1].isnumeric()
+            ):
+                selected_pages = list(range(int(pagefields[0]) - 1, int(pagefields[1])))
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": selected_pages,
+            }

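A minimal sketch of the filename convention in practice (the path is illustrative; only the trailing "_12-14" field matters and the file is never opened):

# Sketch: a trailing "_12-14" page range becomes 0-indexed pages [11, 12, 13].
from country_by_country.pagefilter.from_filename import FromFilename

assets = {}
FromFilename()("/tmp/some_report_12-14.pdf", assets)
print(assets["pagefilter"]["selected_pages"])  # [11, 12, 13]
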
country_by_country/pagefilter/rf_classifier.py
ADDED
@@ -0,0 +1,153 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard import
+import pickle
+import pkgutil
+import tempfile
+
+# External imports
+import joblib
+import numpy as np
+import pypdf
+
+
+class FeatureExtractor:
+    """
+    A class to extract the features of a page as required by the random forest
+    classifier
+    """
+
+    def __init__(self, keywords: list[str], all_country_names: list[str]) -> None:
+        """
+        Arguments:
+            keywords: the keywords to count from the page text content
+            all_country_names: the country names/flags to count in the page content
+        """
+        self.all_country_names = all_country_names
+        self.keywords = keywords
+
+    def number_country_names(self, text: str) -> int:
+        """
+        Computes and returns the total number of occurence of any of the the
+        country names
+        """
+        return sum([text.count(country) for country in self.all_country_names])
+
+    def keyword(self, text: str, keyword: str) -> int:
+        """
+        Computes and returns the number of occurence of the specific keyword
+        """
+        return text.count(keyword)
+
+    def __call__(self, text: str) -> np.array:
+        """
+        Extracts the feature vector from the text
+        The features we extract are:
+        - nb_country: the total number of country names in the page
+        - keywords: how many times a string in the list of keywords is contained in the page
+
+        A typical list of keywords is :
+        ["tax","countr","country by country","country-by-country","report","cbc",\
+        "revenu","transparen","ethic","incom","employ","benefi","asset","contrib",\
+        "profit","accrued","jurisdiction","sales","ebt","paid","stated","accu","tangible",\
+        "fte", "expense", "related","headcount","capital","turnover","retained","current",\
+        "plant","work","intragroup","remuneration","debt","contribution","per country"]
+        """
+        features = [self.number_country_names(text)]
+        features.extend([self.keyword(text, keyword_i) for keyword_i in self.keywords])
+        return features
+
+
+class RFClassifier:
+    """
+    RandomForest classifier of whether a page contains a CbCR table or not
+    This randomforest decides from the text content of the page and is unable
+    to detect a page where a CbCR table would be included as an image
+    """
+
+    def __init__(self, modelfile: str) -> None:
+        # Access the model bundled in the package
+        data = pkgutil.get_data(
+            "country_by_country",
+            f"models/{modelfile}",
+        )
+        keywords = pickle.loads(
+            pkgutil.get_data("country_by_country", "models/random_forest_keywords.pkl"),
+        ).split(",")
+
+        all_country_names = pickle.loads(
+            pkgutil.get_data(
+                "country_by_country",
+                "models/random_forest_country_names.pkl",
+            ),
+        )
+        self.feature_extractor = FeatureExtractor(keywords, all_country_names)
+        # Unpack the data in a temporary file that joblib can then load
+        with tempfile.NamedTemporaryFile("wb", delete=False) as fp:
+            fp.write(data)
+            fp.close()
+            self.clf = joblib.load(fp.name)
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Reads and processes a pdf from its filepath
+        It writes the filtered pdf as a temporary pdf
+        The filepath of this temporary pdf is returned
+
+        Writes assets:
+            src_pdf: the original pdf filepath
+            target_pdf: the temporary target pdf filepath
+            selected_pages : List of int
+        """
+
+        reader = pypdf.PdfReader(pdf_filepath)
+
+        # Extract the features from all the pages
+        page_features = []
+        for p in reader.pages:
+            content = p.extract_text().lower()
+            page_features.append(self.feature_extractor(content))
+
+        # features is now num_pages x num_features_per_page
+        page_features = np.array(page_features)
+        n_pages, n_features_per_page = page_features.shape
+
+        # Concatenate the features of the previous page and the next page
+        # the random forest expects
+        # [features_page_{i-1}, features_page_{i}, features_pages_{i+1}]
+        features = np.zeros((n_pages, 3 * n_features_per_page))
+        features[1:, :n_features_per_page] = page_features[:-1]
+        features[:, n_features_per_page:-n_features_per_page] = page_features
+        features[:-1, -n_features_per_page:] = page_features[1:]
+
+        # Performs the prediction
+        predictions = self.clf.predict(features)
+
+        # And now we keep only the pages that have been selected
+        selected_pages = [ip for ip, keep_p in enumerate(predictions) if keep_p]
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": selected_pages,
+            }

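The neighbour-page windowing above can be checked in isolation; a small sketch with made-up feature values, showing how each row stacks [previous, current, next] page features with zero padding at the document boundaries:

# Sketch: the [prev, current, next] feature window built in RFClassifier.__call__.
import numpy as np

page_features = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # 3 pages, 2 features each
n_pages, n_features_per_page = page_features.shape

features = np.zeros((n_pages, 3 * n_features_per_page))
features[1:, :n_features_per_page] = page_features[:-1]                # previous page
features[:, n_features_per_page:-n_features_per_page] = page_features  # current page
features[:-1, -n_features_per_page:] = page_features[1:]               # next page

print(features)
# [[0. 0. 1. 2. 3. 4.]
#  [1. 2. 3. 4. 5. 6.]
#  [3. 4. 5. 6. 0. 0.]]
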
country_by_country/processor.py
ADDED
@@ -0,0 +1,87 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+import logging
+
+# Local imports
+from . import pagefilter, table_extraction
+from .utils.utils import keep_pages
+
+
+class ReportProcessor:
+    def __init__(self, config: dict) -> None:
+        # Report filter
+        self.page_filter = pagefilter.from_config(config["pagefilter"])
+
+        self.table_extractors = []
+        self.table_cleaners = []
+
+        # Tables extraction
+        if "table_extraction" in config:
+            table_extractors = config["table_extraction"]
+            self.table_extractors = [
+                table_extraction.from_config(name) for name in table_extractors
+            ]
+
+        # Table cleaning & reformatting
+        # We can do this step only if we had table extraction algorithms
+        # otherwise, the assets will not be available
+        #if "table_cleaning" in config:
+        #    table_cleaners = config["table_cleaning"]
+        #    self.table_cleaners = [
+        #        table_cleaning.from_config(name) for name in table_cleaners
+        #    ]
+
+    def process(self, pdf_filepath: str) -> dict:
+        logging.info(f"Processing {pdf_filepath}")
+
+        assets = {
+            "pagefilter": {},
+            "table_extractors": [],
+            "table_cleaners": [],
+        }
+
+        # Identifying the pages to extract
+        self.page_filter(pdf_filepath, assets)
+
+        # Now that we identified the pages to be extracted, we extract them
+        # Note, in a GUI, we could ask the user to the change the content of
+        # assets["pagefilter"]["selected_pages"] before selecting the pages
+        pdf_to_process = keep_pages(
+            pdf_filepath,
+            assets["pagefilter"]["selected_pages"],
+        )
+
+        # Process the selected pages to detect the tables and extract
+        # their contents
+        for table_extractor in self.table_extractors:
+            new_asset = table_extractor(pdf_to_process)
+            assets["table_extractors"].append(new_asset)
+
+        # Give the parsed content to the cleaner stage for getting organized data
+        #for table_cleaner in self.table_cleaners:
+        #    for asset in assets["table_extractors"]:
+        #        new_asset = table_cleaner(asset)
+        #        assets["table_cleaners"].append(new_asset)
+
+        return assets

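A minimal sketch of driving the processor directly, mirroring __main__.py (config and PDF paths are illustrative; load_dotenv is only needed when a stage calls an external API):

# Sketch: run the full pipeline without going through the CLI entry point.
import yaml
from dotenv import load_dotenv

from country_by_country import processor

load_dotenv()

with open("configs/v0.yaml") as fh:
    config = yaml.safe_load(fh)

assets = processor.ReportProcessor(config).process("report_12-14.pdf")  # illustrative path
print(assets["pagefilter"]["selected_pages"], len(assets["table_extractors"]))
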
country_by_country/table_cleaning/__init__.py
ADDED
@@ -0,0 +1,34 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Local imports
+from .llm_cleaner import LLMCleaner
+
+
+def from_config(config: dict) -> LLMCleaner:
+    extractor_type = config["type"]
+    extractor_params = {}
+    if "params" in config:
+        extractor_params = config["params"]
+    if extractor_type == "LLM":
+        return LLMCleaner(**extractor_params)
+    return None

country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (497 Bytes).

country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc
ADDED
Binary file (5.25 kB).

country_by_country/table_cleaning/llm_cleaner.py
ADDED
@@ -0,0 +1,183 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+import logging
+import uuid
+
+import pandas as pd
+
+# External imports
+from IPython.display import display
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_openai import ChatOpenAI
+
+from country_by_country.utils import constants
+
+
+class LLMCleaner:
+    def __init__(self, **kwargs: dict) -> None:
+        """
+        Builds a table cleaner, by extracting clean data from tables
+        extracted during table extraction stage.
+        The kwargs given to the constructor are directly propagated
+        to the LLMCleaner constructor.
+        You are free to define any parameter LLMCleaner recognizes.
+        """
+        self.kwargs = kwargs
+        self.type = "llm_cleaner"
+        self.openai_model = self.kwargs["openai_model"]
+
+    def __call__(self, asset: dict) -> dict:
+        logging.info("\nKicking off cleaning stage...")
+        logging.info(f"Cleaning type: {self.type}, with params: {self.kwargs}")
+        logging.info(
+            f"Input extraction type: {asset['type']}, with params: {asset['params']}",
+        )
+
+        # Extract tables from previous stage
+        tables = asset["tables"]
+
+        logging.info(f"Pulling {len(tables)} tables from extraction stage")
+
+        # Convert tables to html to add to LLM prompt
+        html_tables = [table.to_html() for table in tables]
+
+        # Define our LLM model
+        model = ChatOpenAI(temperature=0, model=self.openai_model)
+
+        # ---------- CHAIN 1/2 - Pull countries from each table ----------
+        logging.info("Starting chain 1/2: extracting country names from tables")
+
+        # Output should have this model (a list of country names)
+        class CountryNames(BaseModel):
+            country_names: list[str] = Field(
+                description="Exhaustive list of countries with financial data in the table",
+                enum=constants.COUNTRIES,
+            )
+
+        # Output should be a JSON with above schema
+        parser1 = JsonOutputParser(pydantic_object=CountryNames)
+
+        # Prompt includes one extracted table and some JSON output formatting instructions
+        prompt1 = PromptTemplate(
+            template="Extract an exhaustive list of countries from the following table "
+            + "in html format:\n{table}\n{format_instructions}",
+            input_variables=["table"],
+            partial_variables={
+                "format_instructions": parser1.get_format_instructions(),
+            },
+        )
+
+        # Chain
+        chain1 = {"table": lambda x: x} | prompt1 | model | parser1
+
+        # Run it
+        responses1 = chain1.batch(html_tables, {"max_concurrency": 4})
+
+        # Extract country lists from responses
+        country_lists = [resp["country_names"] for resp in responses1]
+
+        # ---------- CHAIN 2/2 - Pull financial data for each country ----------
+        logging.info("Starting chain 2/2: extracting financial data from tables")
+
+        # Define country data model
+        class Country(BaseModel):
+            """Financial data about a country"""
+
+            jur_name: str = Field(..., description="Name of the country")
+            total_revenues: float | None = Field(None, description="Total revenues")
+            profit_before_tax: float | None = Field(
+                None,
+                description="Amount of profit (or loss) before tax",
+            )
+            tax_paid: float | None = Field(None, description="Income tax paid")
+            tax_accrued: float | None = Field(None, description="Accrued tax")
+            employees: float | None = Field(None, description="Number of employees")
+            stated_capital: float | None = Field(None, description="Stated capital")
+            accumulated_earnings: float | None = Field(
+                None,
+                description="Accumulated earnings",
+            )
+            tangible_assets: float | None = Field(
+                None,
+                description="Tangible assets other than cash and cash equivalent",
+            )
+
+        # Output should have this model (a list of country objects)
+        class Countries(BaseModel):
+            """Extracting financial data for each country"""
+
+            countries: list[Country]
+
+        # Output should be a JSON with above schema
+        parser2 = PydanticOutputParser(pydantic_object=Countries)
+
+        # Prompt includes one extracted table and some JSON output formatting instructions
+        template = (
+            """You are an assistant tasked with extracting financial """
+            + """data about {country_list} from the following table in html format:\n
+            {table}\n
+            {format_instructions}
+            """
+        )
+
+        # Set up prompt
+        prompt = PromptTemplate.from_template(
+            template,
+            partial_variables={
+                "format_instructions": parser2.get_format_instructions(),
+            },
+        )
+
+        # Chain
+        chain2 = (
+            {"table": lambda x: x[0], "country_list": lambda x: x[1]}
+            | prompt
+            | model.with_structured_output(Countries)
+        )
+
+        # Run it
+        responses2 = chain2.batch(
+            list(zip(html_tables, country_lists, strict=True)),
+            {"max_concurrency": 4},
+        )
+
+        # Merge the tables into one dataframe
+        df = pd.concat(
+            [pd.json_normalize(resp.dict()["countries"]) for resp in responses2],
+        ).reset_index(drop=True)
+
+        # Display
+        display(df)
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": self.type,
+            "params": self.kwargs,
+            "table": df,
+        }
+
+        return new_asset

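The cleaner works on the asset dictionaries produced by the extraction stage (type, params and a tables list of dataframes) and returns a single asset whose table field is the merged dataframe. A rough sketch of that contract with a made-up input table (an OPENAI_API_KEY must be available in the environment; note the cleaning stage is currently commented out in processor.py):

# Sketch: the asset contract around LLMCleaner (made-up input, calls the OpenAI API).
import pandas as pd

from country_by_country.table_cleaning.llm_cleaner import LLMCleaner

extraction_asset = {
    "type": "camelot",                   # shape produced by an extractor
    "params": {"flavor": "stream"},
    "tables": [pd.DataFrame({0: ["France", "Spain"], 1: ["100", "200"]})],
}

cleaner = LLMCleaner(openai_model="gpt-4-turbo-preview")
cleaned = cleaner(extraction_asset)
print(cleaned["type"], list(cleaned["table"].columns))
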
country_by_country/table_extraction/__init__.py
ADDED
@@ -0,0 +1,61 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Local imports
+import logging
+import sys
+
+from .camelot_extractor import Camelot
+from .from_csv import FromCSV
+from .llama_parse_extractor import LlamaParseExtractor
+from .unstructured import Unstructured
+from .unstructured_api import UnstructuredAPI
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
+
+
+def from_config(config: dict) -> Camelot:
+    extractor_type = config["type"]
+    extractor_params = {}
+    if "params" in config:
+        extractor_params = config["params"]
+    if extractor_type == "Camelot":
+        return Camelot(**extractor_params)
+    elif extractor_type == "FromCSV":
+        return FromCSV(**extractor_params)
+    elif extractor_type == "Unstructured":
+        return Unstructured(**extractor_params)
+    elif extractor_type == "UnstructuredAPI":
+        return UnstructuredAPI(**extractor_params)
+    elif extractor_type == "LlamaParse":
+        return LlamaParseExtractor(**extractor_params)
+    elif extractor_type == "ExtractTableAPI":
+        # This is for legacy support
+        # In order to be able to use ExtractTable
+        # for benchmarking
+        # Note: ExtractTable-py is not maintained anymore
+        # This is the reason why this case is handled in a specific way
+        from .extract_table_api import ExtractTableAPI
+
+        return ExtractTableAPI(**extractor_params)
+    else:
+        logging.info(f"There are no extractors of the type : {extractor_type}")

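A minimal sketch of the extractor factory, fed with the two Camelot entries from configs/v0.yaml:

# Sketch: build extractors from config mappings, as ReportProcessor does.
from country_by_country import table_extraction

extractors = [
    table_extraction.from_config(cfg)
    for cfg in (
        {"type": "Camelot", "params": {"flavor": "stream"}},
        {"type": "Camelot", "params": {"flavor": "lattice"}},
    )
]
print([e.type for e in extractors])  # ['camelot', 'camelot']
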
country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.16 kB).

country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc
ADDED
Binary file (1.3 kB).

country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc
ADDED
Binary file (1.69 kB).

country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc
ADDED
Binary file (1.24 kB).

country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc
ADDED
Binary file (1.89 kB).

country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc
ADDED
Binary file (1.84 kB).

country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc
ADDED
Binary file (2.24 kB).

country_by_country/table_extraction/camelot_extractor.py
ADDED
@@ -0,0 +1,57 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+import logging
+import uuid
+
+# External imports
+import camelot
+
+
+class Camelot:
+    def __init__(self, flavor: str) -> None:
+        self.flavor = flavor
+        self.type = "camelot"
+
+    def __call__(self, pdf_filepath: str) -> dict:
+        """
+        Returns asset that contain:
+            tables: a list of pandas dataframe of the parsed tables
+        """
+        logging.info("\nKicking off extraction stage...")
+        logging.info(f"Extraction type: {self.type}, with params: {self.flavor}")
+
+        tables = camelot.read_pdf(pdf_filepath, flavor=self.flavor)
+
+        # Write the parsed tables into the assets
+        tables_list = [t.df for t in tables]
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": "camelot",
+            "params": {"flavor": self.flavor},
+            "tables": tables_list,
+        }
+
+        return new_asset

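A minimal sketch of the asset this wrapper returns when called on a (page-filtered) PDF; the path is illustrative and camelot-py must be installed:

# Sketch: inspect the asset produced by the Camelot wrapper.
from country_by_country.table_extraction.camelot_extractor import Camelot

asset = Camelot(flavor="stream")("filtered_report.pdf")  # illustrative path
print(asset["type"], asset["params"])                    # camelot {'flavor': 'stream'}
for df in asset["tables"]:
    print(df.shape)
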
country_by_country/table_extraction/extract_table_api.py
ADDED
@@ -0,0 +1,63 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Standard imports
+import os
+import uuid
+
+# External imports
+try:
+    from ExtractTable import ExtractTable
+except ImportError as e:
+
+    class ExtractTableModuleException(Exception):
+        def __init__(self) -> None:
+            super().__init__("You must install ExtractTable : pip install ExtractTable")
+
+    raise ExtractTableModuleException() from e
+
+
+class ExtractTableAPI:
+    def __init__(self) -> None:
+        api_key = os.getenv("EXTRACT_TABLE_API_KEY")
+        self.extract_table = ExtractTable(api_key)
+
+    def __call__(self, pdf_filepath: str) -> None:
+        """
+        Writes assets:
+            ntables: the number of detected tables
+            tables: a list of pandas dataframe of the parsed tables
+        """
+        tables_list = self.extract_table.process_file(
+            filepath=pdf_filepath,
+            pages="all",
+            output_format="df",
+        )
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": "ExtractTableAPI",
+            "tables": tables_list,
+        }
+
+        return new_asset