Ronan commited on
Commit
ec6dd69
1 Parent(s): 5a0bc6c

feat: first commit

Browse files

feat: add license

feat : add requirements.txt

fix: rm poetry dependency

fix:opencv version

fix requirements

add pillow-heif = "^0.15.0"

fix: sck learn

add packages.txt

rm camelot config

fix: dependency issues

fix altair version

add tesseract packages

rm tesseract-ocr-dev

fix: come back to aggrid 0.3.4

feat : update

comment cleaning part

rm dependency cleaning

add models

MAJ Hugging Face

fix: use app.py

update

UPDATE

fix: dont use app/ path

fix: path

add Extractable

fix: no ExtractTable

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +1 -0
  2. .streamlit/config.toml +2 -0
  3. LICENSE +20 -0
  4. README.md +3 -6
  5. __pycache__/menu.cpython-310.pyc +0 -0
  6. __pycache__/utils.cpython-310.pyc +0 -0
  7. app.py +3 -0
  8. configs/test_full_workflow.yaml +16 -0
  9. configs/v0.yaml +20 -0
  10. country_by_country/.empty +0 -0
  11. country_by_country/__init__.py +21 -0
  12. country_by_country/__main__.py +67 -0
  13. country_by_country/__pycache__/__init__.cpython-310.pyc +0 -0
  14. country_by_country/__pycache__/__main__.cpython-310.pyc +0 -0
  15. country_by_country/__pycache__/dash_demo.cpython-310.pyc +0 -0
  16. country_by_country/__pycache__/dash_process_methods.cpython-310.pyc +0 -0
  17. country_by_country/__pycache__/processor.cpython-310.pyc +0 -0
  18. country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
  19. country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
  20. country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
  21. country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
  22. country_by_country/models/decision_tree_model.joblib +0 -0
  23. country_by_country/models/random_forest_country_names.pkl +0 -0
  24. country_by_country/models/random_forest_keywords.pkl +0 -0
  25. country_by_country/models/random_forest_model_high_false_positive.joblib +0 -0
  26. country_by_country/models/random_forest_model_low_false_positive.joblib +0 -0
  27. country_by_country/pagefilter/__init__.py +41 -0
  28. country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc +0 -0
  29. country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc +0 -0
  30. country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc +0 -0
  31. country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc +0 -0
  32. country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc +0 -0
  33. country_by_country/pagefilter/copy_as_is.py +51 -0
  34. country_by_country/pagefilter/from_filename.py +79 -0
  35. country_by_country/pagefilter/rf_classifier.py +153 -0
  36. country_by_country/processor.py +87 -0
  37. country_by_country/table_cleaning/__init__.py +34 -0
  38. country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc +0 -0
  39. country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc +0 -0
  40. country_by_country/table_cleaning/llm_cleaner.py +183 -0
  41. country_by_country/table_extraction/__init__.py +61 -0
  42. country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
  43. country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
  44. country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc +0 -0
  45. country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc +0 -0
  46. country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
  47. country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
  48. country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc +0 -0
  49. country_by_country/table_extraction/camelot_extractor.py +57 -0
  50. country_by_country/table_extraction/extract_table_api.py +63 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv*
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [client]
2
+ showSidebarNavigation = false
LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2015-2024 Data4Good
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md CHANGED
@@ -1,12 +1,9 @@
1
  ---
2
- title: Taxobservatory Demo
3
- emoji: 📉
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.32.2
8
  app_file: app.py
9
  pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: TaxObservatory Demo
3
+ colorFrom: red
4
+ colorTo: green
 
5
  sdk: streamlit
6
  sdk_version: 1.32.2
7
  app_file: app.py
8
  pinned: false
9
  ---
 
 
__pycache__/menu.cpython-310.pyc ADDED
Binary file (1.37 kB). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.03 kB). View file
 
app.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import streamlit as st
2
+
3
+ st.switch_page("pages/0_Import_File.py")
configs/test_full_workflow.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Full workflow
2
+ # Requires OpenAI API key and only works with table_extraction:Unstructured and r
3
+
4
+ pagefilter:
5
+ type: FromFilename
6
+
7
+ table_extraction:
8
+ - type: Unstructured
9
+ params:
10
+ pdf_image_dpi: 300
11
+ hi_res_model_name: "yolox"
12
+
13
+ table_cleaning:
14
+ - type: LLM
15
+ params:
16
+ openai_model: "gpt-4-turbo-preview"
configs/v0.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pagefilter:
2
+ type: RFClassifier
3
+ params:
4
+ modelfile: random_forest_model_low_false_positive.joblib
5
+
6
+ table_extraction:
7
+ - type: Camelot
8
+ params:
9
+ flavor: stream
10
+ - type: Camelot
11
+ params:
12
+ flavor: lattice
13
+ - type: Unstructured
14
+ params:
15
+ hi_res_model_name: "yolox"
16
+ pdf_image_dpi: 300
17
+ # - type: LLamaParse
18
+ # - type: UnstructuredAPI
19
+
20
+ # table_cleaning:
country_by_country/.empty ADDED
File without changes
country_by_country/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
country_by_country/__main__.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ import logging
25
+ import pickle
26
+ import sys
27
+ from pathlib import Path
28
+
29
+ import yaml
30
+
31
+ # Local imports
32
+ from dotenv import load_dotenv
33
+
34
+ from country_by_country import processor
35
+
36
+ NUM_CLI_ARGS = 3
37
+
38
+
39
+ def process_report(config: dict, pdf_filepath: str) -> None:
40
+ # Loading API keys from .env file
41
+ load_dotenv()
42
+
43
+ proc = processor.ReportProcessor(config)
44
+ return proc.process(pdf_filepath)
45
+
46
+
47
+ if __name__ == "__main__":
48
+
49
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
50
+
51
+ if len(sys.argv) != NUM_CLI_ARGS:
52
+ logging.error("Usage : python -m country_by_country config.yaml report.pdf")
53
+ sys.exit(-1)
54
+
55
+ logging.info(f"\nLoading {sys.argv[1]}")
56
+ with Path(sys.argv[1]).open() as fh:
57
+ config = yaml.safe_load(fh)
58
+
59
+ assets = process_report(config, sys.argv[2])
60
+
61
+ # Save all the assets to disk
62
+ with Path("assets.pkl").open("wb") as fh:
63
+ pickle.dump(assets, fh)
64
+ logging.info(
65
+ "Assets dumped in assets.pkl. You can read then using : \n"
66
+ + "pickle.load(open('assets.pkl', 'rb'))",
67
+ )
country_by_country/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (176 Bytes). View file
 
country_by_country/__pycache__/__main__.cpython-310.pyc ADDED
Binary file (977 Bytes). View file
 
country_by_country/__pycache__/dash_demo.cpython-310.pyc ADDED
Binary file (10.5 kB). View file
 
country_by_country/__pycache__/dash_process_methods.cpython-310.pyc ADDED
Binary file (6.62 kB). View file
 
country_by_country/__pycache__/processor.cpython-310.pyc ADDED
Binary file (1.38 kB). View file
 
country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (699 Bytes). View file
 
country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc ADDED
Binary file (1.3 kB). View file
 
country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc ADDED
Binary file (1.65 kB). View file
 
country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc ADDED
Binary file (1.84 kB). View file
 
country_by_country/models/decision_tree_model.joblib ADDED
Binary file (5.1 kB). View file
 
country_by_country/models/random_forest_country_names.pkl ADDED
Binary file (10.5 kB). View file
 
country_by_country/models/random_forest_keywords.pkl ADDED
Binary file (328 Bytes). View file
 
country_by_country/models/random_forest_model_high_false_positive.joblib ADDED
Binary file (21.1 kB). View file
 
country_by_country/models/random_forest_model_low_false_positive.joblib ADDED
Binary file (106 kB). View file
 
country_by_country/pagefilter/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+
25
+ # Local imports
26
+ from .copy_as_is import CopyAsIs
27
+ from .from_filename import FromFilename
28
+ from .rf_classifier import RFClassifier
29
+
30
+
31
+ def from_config(config: dict) -> CopyAsIs | FromFilename:
32
+ filter_type = config["type"]
33
+ if "params" in config:
34
+ params = config["params"]
35
+
36
+ if filter_type == "CopyAsIs":
37
+ return CopyAsIs()
38
+ elif filter_type == "FromFilename":
39
+ return FromFilename()
40
+ elif filter_type == "RFClassifier":
41
+ return RFClassifier(**params)
country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (630 Bytes). View file
 
country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc ADDED
Binary file (1.13 kB). View file
 
country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc ADDED
Binary file (777 Bytes). View file
 
country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc ADDED
Binary file (1.83 kB). View file
 
country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc ADDED
Binary file (5.05 kB). View file
 
country_by_country/pagefilter/copy_as_is.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # External imports
24
+ import pypdf
25
+
26
+
27
+ class CopyAsIs:
28
+ """
29
+ Dummy filter just copying the source pdf to a target
30
+ temporary file
31
+ """
32
+
33
+ def __init__(self) -> None:
34
+ pass
35
+
36
+ def __call__(self, pdf_filepath: str, assets: dict) -> None:
37
+ """
38
+ Basically keeps all the pages of the original document
39
+ Writes assets:
40
+ src_pdf: the original pdf filepath
41
+ selected_pages : list of selected pages
42
+ """
43
+
44
+ reader = pypdf.PdfReader(pdf_filepath)
45
+ n_pages = len(reader.pages)
46
+
47
+ if assets is not None:
48
+ assets["pagefilter"] = {
49
+ "src_pdf": pdf_filepath,
50
+ "selected_pages": list(range(n_pages)),
51
+ }
country_by_country/pagefilter/from_filename.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ from pathlib import Path
25
+
26
+ NUM_PAGE_FIELDS = 2
27
+
28
+
29
+ class FromFilename:
30
+ """
31
+ Filtering from filename. This filter expects the filename
32
+ of the pdf contains either the page or a page range of interest
33
+ explicitely given in the filename as :
34
+
35
+ /dir/containing/the/filename_of_the_report_#1.pdf
36
+ /dif/containing/the/filename_of_the_report_#1-#2.pdf
37
+
38
+ where #1 is a single page
39
+ #1-#2 is a page range
40
+ """
41
+
42
+ def __init__(self) -> None:
43
+ pass
44
+
45
+ def __call__(self, pdf_filepath: str, assets: dict) -> None:
46
+ """
47
+ Reads and processes a pdf from its filepath
48
+ It writes the filtered pdf as a temporary pdf
49
+ The filepath of this temporary pdf is returned
50
+
51
+ Writes assets:
52
+ src_pdf: the original pdf filepath
53
+ target_pdf: the temporary target pdf filepath
54
+ selected_pages : list of selected pages
55
+ """
56
+
57
+ # Get the page or page range from the filename
58
+ src_filename = Path(pdf_filepath).name
59
+
60
+ # We remove the extension, split on "_" and keep the last field
61
+ pagefield = src_filename[:-4].split("_")[-1]
62
+ selected_pages = []
63
+
64
+ if pagefield.isnumeric():
65
+ selected_pages = [int(pagefield) - 1]
66
+ else:
67
+ pagefields = pagefield.split("-")
68
+ if (
69
+ len(pagefields) == NUM_PAGE_FIELDS
70
+ and pagefields[0].isnumeric()
71
+ and pagefields[1].isnumeric()
72
+ ):
73
+ selected_pages = list(range(int(pagefields[0]) - 1, int(pagefields[1])))
74
+
75
+ if assets is not None:
76
+ assets["pagefilter"] = {
77
+ "src_pdf": pdf_filepath,
78
+ "selected_pages": selected_pages,
79
+ }
country_by_country/pagefilter/rf_classifier.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard import
24
+ import pickle
25
+ import pkgutil
26
+ import tempfile
27
+
28
+ # External imports
29
+ import joblib
30
+ import numpy as np
31
+ import pypdf
32
+
33
+
34
+ class FeatureExtractor:
35
+ """
36
+ A class to extract the features of a page as required by the random forest
37
+ classifier
38
+ """
39
+
40
+ def __init__(self, keywords: list[str], all_country_names: list[str]) -> None:
41
+ """
42
+ Arguments:
43
+ keywords: the keywords to count from the page text content
44
+ all_country_names: the country names/flags to count in the page content
45
+ """
46
+ self.all_country_names = all_country_names
47
+ self.keywords = keywords
48
+
49
+ def number_country_names(self, text: str) -> int:
50
+ """
51
+ Computes and returns the total number of occurence of any of the the
52
+ country names
53
+ """
54
+ return sum([text.count(country) for country in self.all_country_names])
55
+
56
+ def keyword(self, text: str, keyword: str) -> int:
57
+ """
58
+ Computes and returns the number of occurence of the specific keyword
59
+ """
60
+ return text.count(keyword)
61
+
62
+ def __call__(self, text: str) -> np.array:
63
+ """
64
+ Extracts the feature vector from the text
65
+ The features we extract are:
66
+ - nb_country: the total number of country names in the page
67
+ - keywords: how many times a string in the list of keywords is contained in the page
68
+
69
+ A typical list of keywords is :
70
+ ["tax","countr","country by country","country-by-country","report","cbc",\
71
+ "revenu","transparen","ethic","incom","employ","benefi","asset","contrib",\
72
+ "profit","accrued","jurisdiction","sales","ebt","paid","stated","accu","tangible",\
73
+ "fte", "expense", "related","headcount","capital","turnover","retained","current",\
74
+ "plant","work","intragroup","remuneration","debt","contribution","per country"]
75
+ """
76
+ features = [self.number_country_names(text)]
77
+ features.extend([self.keyword(text, keyword_i) for keyword_i in self.keywords])
78
+ return features
79
+
80
+
81
+ class RFClassifier:
82
+ """
83
+ RandomForest classifier of whether a page contains a CbCR table or not
84
+ This randomforest decides from the text content of the page and is unable
85
+ to detect a page where a CbCR table would be included as an image
86
+ """
87
+
88
+ def __init__(self, modelfile: str) -> None:
89
+ # Access the model bundled in the package
90
+ data = pkgutil.get_data(
91
+ "country_by_country",
92
+ f"models/{modelfile}",
93
+ )
94
+ keywords = pickle.loads(
95
+ pkgutil.get_data("country_by_country", "models/random_forest_keywords.pkl"),
96
+ ).split(",")
97
+
98
+ all_country_names = pickle.loads(
99
+ pkgutil.get_data(
100
+ "country_by_country",
101
+ "models/random_forest_country_names.pkl",
102
+ ),
103
+ )
104
+ self.feature_extractor = FeatureExtractor(keywords, all_country_names)
105
+ # Unpack the data in a temporary file that joblib can then load
106
+ with tempfile.NamedTemporaryFile("wb", delete=False) as fp:
107
+ fp.write(data)
108
+ fp.close()
109
+ self.clf = joblib.load(fp.name)
110
+
111
+ def __call__(self, pdf_filepath: str, assets: dict) -> None:
112
+ """
113
+ Reads and processes a pdf from its filepath
114
+ It writes the filtered pdf as a temporary pdf
115
+ The filepath of this temporary pdf is returned
116
+
117
+ Writes assets:
118
+ src_pdf: the original pdf filepath
119
+ target_pdf: the temporary target pdf filepath
120
+ selected_pages : List of int
121
+ """
122
+
123
+ reader = pypdf.PdfReader(pdf_filepath)
124
+
125
+ # Extract the features from all the pages
126
+ page_features = []
127
+ for p in reader.pages:
128
+ content = p.extract_text().lower()
129
+ page_features.append(self.feature_extractor(content))
130
+
131
+ # features is now num_pages x num_features_per_page
132
+ page_features = np.array(page_features)
133
+ n_pages, n_features_per_page = page_features.shape
134
+
135
+ # Concatenate the features of the previous page and the next page
136
+ # the random forest expects
137
+ # [features_page_{i-1}, features_page_{i}, features_pages_{i+1}]
138
+ features = np.zeros((n_pages, 3 * n_features_per_page))
139
+ features[1:, :n_features_per_page] = page_features[:-1]
140
+ features[:, n_features_per_page:-n_features_per_page] = page_features
141
+ features[:-1, -n_features_per_page:] = page_features[1:]
142
+
143
+ # Performs the prediction
144
+ predictions = self.clf.predict(features)
145
+
146
+ # And now we keep only the pages that have been selected
147
+ selected_pages = [ip for ip, keep_p in enumerate(predictions) if keep_p]
148
+
149
+ if assets is not None:
150
+ assets["pagefilter"] = {
151
+ "src_pdf": pdf_filepath,
152
+ "selected_pages": selected_pages,
153
+ }
country_by_country/processor.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ import logging
25
+
26
+ # Local imports
27
+ from . import pagefilter, table_extraction
28
+ from .utils.utils import keep_pages
29
+
30
+
31
+ class ReportProcessor:
32
+ def __init__(self, config: dict) -> None:
33
+ # Report filter
34
+ self.page_filter = pagefilter.from_config(config["pagefilter"])
35
+
36
+ self.table_extractors = []
37
+ self.table_cleaners = []
38
+
39
+ # Tables extraction
40
+ if "table_extraction" in config:
41
+ table_extractors = config["table_extraction"]
42
+ self.table_extractors = [
43
+ table_extraction.from_config(name) for name in table_extractors
44
+ ]
45
+
46
+ # Table cleaning & reformatting
47
+ # We can do this step only if we had table extraction algorithms
48
+ # otherwise, the assets will not be available
49
+ #if "table_cleaning" in config:
50
+ # table_cleaners = config["table_cleaning"]
51
+ # self.table_cleaners = [
52
+ # table_cleaning.from_config(name) for name in table_cleaners
53
+ # ]
54
+
55
+ def process(self, pdf_filepath: str) -> dict:
56
+ logging.info(f"Processing {pdf_filepath}")
57
+
58
+ assets = {
59
+ "pagefilter": {},
60
+ "table_extractors": [],
61
+ "table_cleaners": [],
62
+ }
63
+
64
+ # Identifying the pages to extract
65
+ self.page_filter(pdf_filepath, assets)
66
+
67
+ # Now that we identified the pages to be extracted, we extract them
68
+ # Note, in a GUI, we could ask the user to the change the content of
69
+ # assets["pagefilter"]["selected_pages"] before selecting the pages
70
+ pdf_to_process = keep_pages(
71
+ pdf_filepath,
72
+ assets["pagefilter"]["selected_pages"],
73
+ )
74
+
75
+ # Process the selected pages to detect the tables and extract
76
+ # their contents
77
+ for table_extractor in self.table_extractors:
78
+ new_asset = table_extractor(pdf_to_process)
79
+ assets["table_extractors"].append(new_asset)
80
+
81
+ # Give the parsed content to the cleaner stage for getting organized data
82
+ #for table_cleaner in self.table_cleaners:
83
+ # for asset in assets["table_extractors"]:
84
+ # new_asset = table_cleaner(asset)
85
+ # assets["table_cleaners"].append(new_asset)
86
+
87
+ return assets
country_by_country/table_cleaning/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Local imports
24
+ from .llm_cleaner import LLMCleaner
25
+
26
+
27
+ def from_config(config: dict) -> LLMCleaner:
28
+ extractor_type = config["type"]
29
+ extractor_params = {}
30
+ if "params" in config:
31
+ extractor_params = config["params"]
32
+ if extractor_type == "LLM":
33
+ return LLMCleaner(**extractor_params)
34
+ return None
country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (497 Bytes). View file
 
country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc ADDED
Binary file (5.25 kB). View file
 
country_by_country/table_cleaning/llm_cleaner.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ import logging
25
+ import uuid
26
+
27
+ import pandas as pd
28
+
29
+ # External imports
30
+ from IPython.display import display
31
+ from langchain.prompts import PromptTemplate
32
+ from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
33
+ from langchain_core.pydantic_v1 import BaseModel, Field
34
+ from langchain_openai import ChatOpenAI
35
+
36
+ from country_by_country.utils import constants
37
+
38
+
39
class LLMCleaner:
    """Cleaning stage: turns raw extracted tables into one tidy dataframe.

    Runs two LLM chains over every table produced by the extraction stage:
    chain 1 lists the countries present in each table, chain 2 extracts the
    per-country financial figures for those countries.
    """

    def __init__(self, **kwargs: dict) -> None:
        """
        Builds a table cleaner, by extracting clean data from tables
        extracted during table extraction stage.
        The kwargs given to the constructor are directly propagated
        to the LLMCleaner constructor.
        You are free to define any parameter LLMCleaner recognizes.
        """
        self.kwargs = kwargs
        self.type = "llm_cleaner"
        # NOTE(review): raises KeyError when "openai_model" is missing from
        # the params — confirm this hard failure is the intended behavior.
        self.openai_model = self.kwargs["openai_model"]

    def __call__(self, asset: dict) -> dict:
        """Clean the tables of an extraction-stage asset.

        Args:
            asset: output of a table-extraction stage; must provide the keys
                "type", "params" and "tables" (a list of pandas dataframes).

        Returns:
            A new asset dict with keys "id", "type", "params" and "table",
            the latter being a single dataframe with one row per country.
        """
        logging.info("\nKicking off cleaning stage...")
        logging.info(f"Cleaning type: {self.type}, with params: {self.kwargs}")
        logging.info(
            f"Input extraction type: {asset['type']}, with params: {asset['params']}",
        )

        # Extract tables from previous stage
        tables = asset["tables"]

        logging.info(f"Pulling {len(tables)} tables from extraction stage")

        # Convert tables to html to add to LLM prompt
        html_tables = [table.to_html() for table in tables]

        # Define our LLM model
        model = ChatOpenAI(temperature=0, model=self.openai_model)

        # ---------- CHAIN 1/2 - Pull countries from each table ----------
        logging.info("Starting chain 1/2: extracting country names from tables")

        # Output should have this model (a list of country names)
        class CountryNames(BaseModel):
            """List of country names found in one table."""

            country_names: list[str] = Field(
                description="Exhaustive list of countries with financial data in the table",
                enum=constants.COUNTRIES,
            )

        # Output should be a JSON with above schema
        parser1 = JsonOutputParser(pydantic_object=CountryNames)

        # Prompt includes one extracted table and some JSON output formatting instructions
        prompt1 = PromptTemplate(
            template="Extract an exhaustive list of countries from the following table "
            + "in html format:\n{table}\n{format_instructions}",
            input_variables=["table"],
            partial_variables={
                "format_instructions": parser1.get_format_instructions(),
            },
        )

        # Chain
        chain1 = {"table": lambda x: x} | prompt1 | model | parser1

        # Run it
        responses1 = chain1.batch(html_tables, {"max_concurrency": 4})

        # Extract country lists from responses
        country_lists = [resp["country_names"] for resp in responses1]

        # ---------- CHAIN 2/2 - Pull financial data for each country ----------
        logging.info("Starting chain 2/2: extracting financial data from tables")

        # Define country data model
        class Country(BaseModel):
            """Financial data about a country"""

            jur_name: str = Field(..., description="Name of the country")
            total_revenues: float | None = Field(None, description="Total revenues")
            profit_before_tax: float | None = Field(
                None,
                description="Amount of profit (or loss) before tax",
            )
            tax_paid: float | None = Field(None, description="Income tax paid")
            tax_accrued: float | None = Field(None, description="Accrued tax")
            employees: float | None = Field(None, description="Number of employees")
            stated_capital: float | None = Field(None, description="Stated capital")
            accumulated_earnings: float | None = Field(
                None,
                description="Accumulated earnings",
            )
            tangible_assets: float | None = Field(
                None,
                description="Tangible assets other than cash and cash equivalent",
            )

        # Output should have this model (a list of country objects)
        class Countries(BaseModel):
            """Extracting financial data for each country"""

            countries: list[Country]

        # Output should be a JSON with above schema
        parser2 = PydanticOutputParser(pydantic_object=Countries)

        # Prompt includes one extracted table and some JSON output formatting instructions
        template = (
            """You are an assistant tasked with extracting financial """
            + """data about {country_list} from the following table in html format:\n
            {table}\n
            {format_instructions}
            """
        )

        # Set up prompt
        prompt = PromptTemplate.from_template(
            template,
            partial_variables={
                "format_instructions": parser2.get_format_instructions(),
            },
        )

        # Chain
        chain2 = (
            {"table": lambda x: x[0], "country_list": lambda x: x[1]}
            | prompt
            | model.with_structured_output(Countries)
        )

        # Run it
        responses2 = chain2.batch(
            list(zip(html_tables, country_lists, strict=True)),
            {"max_concurrency": 4},
        )

        # Merge the tables into one dataframe
        # .dict() is the pydantic-v1 accessor (matches the langchain_core.pydantic_v1 import)
        df = pd.concat(
            [pd.json_normalize(resp.dict()["countries"]) for resp in responses2],
        ).reset_index(drop=True)

        # Display
        # NOTE(review): IPython display() in a pipeline stage looks like a
        # notebook leftover — consider replacing with logging.
        display(df)

        # Create asset
        new_asset = {
            "id": uuid.uuid4(),
            "type": self.type,
            "params": self.kwargs,
            "table": df,
        }

        return new_asset
country_by_country/table_extraction/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Local imports
24
+ import logging
25
+ import sys
26
+
27
+ from .camelot_extractor import Camelot
28
+ from .from_csv import FromCSV
29
+ from .llama_parse_extractor import LlamaParseExtractor
30
+ from .unstructured import Unstructured
31
+ from .unstructured_api import UnstructuredAPI
32
+
33
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
34
+
35
+
36
def from_config(config: dict) -> Camelot:
    """Instantiate a table extractor from a configuration dictionary.

    Args:
        config: must contain a "type" key naming the extractor (Camelot,
            FromCSV, Unstructured, UnstructuredAPI, LlamaParse or
            ExtractTableAPI) and may contain a "params" dict that is
            forwarded verbatim to the extractor constructor.

    Returns:
        The extractor instance, or None when the type is unknown.
    """
    extractor_type = config["type"]
    extractor_params = {}
    if "params" in config:
        extractor_params = config["params"]
    if extractor_type == "Camelot":
        return Camelot(**extractor_params)
    elif extractor_type == "FromCSV":
        return FromCSV(**extractor_params)
    elif extractor_type == "Unstructured":
        return Unstructured(**extractor_params)
    elif extractor_type == "UnstructuredAPI":
        return UnstructuredAPI(**extractor_params)
    elif extractor_type == "LlamaParse":
        return LlamaParseExtractor(**extractor_params)
    elif extractor_type == "ExtractTableAPI":
        # This is for legacy support
        # In order to be able to use ExtractTable
        # for benchmarking
        # Note: ExtractTable-py is not maintained anymore
        # This is the reason why this case is handled in a specific way
        from .extract_table_api import ExtractTableAPI

        return ExtractTableAPI(**extractor_params)
    else:
        # An unknown extractor type is a configuration error, not routine
        # progress information: log it as a warning and return None
        # explicitly so callers can detect the failure.
        logging.warning(f"There are no extractors of the type : {extractor_type}")
        return None
country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc ADDED
Binary file (1.3 kB). View file
 
country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc ADDED
Binary file (1.69 kB). View file
 
country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc ADDED
Binary file (1.24 kB). View file
 
country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc ADDED
Binary file (1.89 kB). View file
 
country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc ADDED
Binary file (1.84 kB). View file
 
country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc ADDED
Binary file (2.24 kB). View file
 
country_by_country/table_extraction/camelot_extractor.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ import logging
25
+ import uuid
26
+
27
+ # External imports
28
+ import camelot
29
+
30
+
31
class Camelot:
    """Table extractor backed by the camelot library."""

    def __init__(self, flavor: str) -> None:
        """
        Args:
            flavor: the camelot parsing flavor (e.g. "lattice" or "stream"),
                forwarded to camelot.read_pdf.
        """
        self.flavor = flavor
        self.type = "camelot"

    def __call__(self, pdf_filepath: str) -> dict:
        """
        Returns asset that contain:
            tables: a list of pandas dataframe of the parsed tables
        """
        logging.info("\nKicking off extraction stage...")
        logging.info(f"Extraction type: {self.type}, with params: {self.flavor}")

        tables = camelot.read_pdf(pdf_filepath, flavor=self.flavor)

        # Write the parsed tables into the assets
        tables_list = [t.df for t in tables]

        # Create asset
        new_asset = {
            "id": uuid.uuid4(),
            # Reuse self.type rather than repeating the "camelot" literal so
            # the reported type cannot drift from the attribute.
            "type": self.type,
            "params": {"flavor": self.flavor},
            "tables": tables_list,
        }

        return new_asset
country_by_country/table_extraction/extract_table_api.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2024 dataforgood
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Standard imports
24
+ import os
25
+ import uuid
26
+
27
+ # External imports
28
try:
    from ExtractTable import ExtractTable
except ImportError as e:
    # ExtractTable-py is an optional, unmaintained dependency: fail fast at
    # import time with an explicit install hint instead of surfacing a bare
    # ModuleNotFoundError deeper in the pipeline.

    class ExtractTableModuleException(Exception):
        def __init__(self) -> None:
            super().__init__("You must install ExtractTable : pip install ExtractTable")

    raise ExtractTableModuleException() from e
37
+
38
+
39
+ class ExtractTableAPI:
40
+ def __init__(self) -> None:
41
+ api_key = os.getenv("EXTRACT_TABLE_API_KEY")
42
+ self.extract_table = ExtractTable(api_key)
43
+
44
+ def __call__(self, pdf_filepath: str) -> None:
45
+ """
46
+ Writes assets:
47
+ ntables: the number of detected tables
48
+ tables: a list of pandas dataframe of the parsed tables
49
+ """
50
+ tables_list = self.extract_table.process_file(
51
+ filepath=pdf_filepath,
52
+ pages="all",
53
+ output_format="df",
54
+ )
55
+
56
+ # Create asset
57
+ new_asset = {
58
+ "id": uuid.uuid4(),
59
+ "type": "ExtractTableAPI",
60
+ "tables": tables_list,
61
+ }
62
+
63
+ return new_asset