File size: 1,966 Bytes
9e80f82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import PyPDF2

def load_single_document(file_path: str):
    """Load one document from disk and return its text.

    The loader is selected from the last four characters of ``file_path``,
    compared case-insensitively:

    * ``.txt`` / ``.csv`` -- the file is opened in text mode and its content
      is returned verbatim.
    * ``.pdf`` -- the file is parsed with ``PyPDF2.PdfReader`` and the text
      extracted from every page is concatenated in page order.

    Args:
        file_path: Path of the file to load.

    Returns:
        The document text as a single string.

    Raises:
        Exception: With message ``'Invalid file type'`` when the path does
            not end in one of the supported extensions.
    """
    extension = file_path[-4:].lower()

    if extension in ('.txt', '.csv'):
        # Both formats are returned as raw text; no CSV parsing is done here.
        with open(file_path, 'r') as f:
            return f.read()

    if extension == '.pdf':
        # The context manager guarantees the handle is closed even when
        # parsing fails. Extraction must finish inside the `with` body
        # because the reader pulls page data from the open file.
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # `or ''` keeps a page with no extractable text from breaking
            # the concatenation.
            return ''.join(page.extract_text() or '' for page in reader.pages)

    raise Exception('Invalid file type')


def load_documents(file_paths: list[str] = None, source_dir: str = None):
    """Load every supported document from a list of paths or a directory.

    ``file_paths`` takes precedence when it is non-empty. Otherwise the
    regular files directly inside ``source_dir`` are used (as absolute
    paths). Only paths whose last four characters are ``.txt``, ``.pdf`` or
    ``.csv`` are loaded; everything else is skipped silently.

    Args:
        file_paths: Explicit paths to load.
        source_dir: Directory to scan (not recursive) when no paths are given.

    Returns:
        A list of ``{'name': <base file name>, 'content': <text>}`` dicts,
        one per loaded document, in the order the paths were encountered.

    Raises:
        Exception: When neither ``file_paths`` nor ``source_dir`` is provided.
    """
    supported_suffixes = ('.txt', '.pdf', '.csv')

    if file_paths:
        candidates = file_paths
    elif source_dir:
        candidates = []
        for entry in os.listdir(source_dir):
            joined = os.path.join(source_dir, entry)
            # Sub-directories and other non-file entries are ignored.
            if os.path.isfile(joined):
                candidates.append(os.path.abspath(joined))
    else:
        raise Exception('No file paths or source directory provided')

    documents = []
    for path in candidates:
        if path[-4:] not in supported_suffixes:
            continue
        documents.append({
            'name': os.path.basename(path),
            'content': load_single_document(f"{path}"),
        })
    return documents

def load_io(file_byte = None):
    """Load the text of a single in-memory / uploaded document.

    Unlike ``load_single_document`` this works on an already opened
    file-like object rather than a path. The loader is selected from the
    end of ``file_byte.name``, compared case-insensitively:

    * ``txt`` -- ``file_byte.read()`` is decoded as UTF-8.
    * ``pdf`` -- the object is handed to ``PyPDF2.PdfReader`` and the text
      extracted from every page is concatenated in page order.

    Args:
        file_byte: Object exposing a ``name`` attribute, a ``read()`` that
            returns bytes (txt case), and whatever ``PyPDF2.PdfReader``
            requires of its stream (pdf case).

    Returns:
        The document text as a single string.

    Raises:
        Exception: With message ``'Invalid file type'`` when the name does
            not end in a supported extension.
    """
    # Lower-casing lets names such as "REPORT.PDF" be accepted as well.
    name = file_byte.name.lower()

    if name.endswith('txt'):
        return file_byte.read().decode("utf-8")

    if name.endswith('pdf'):
        reader = PyPDF2.PdfReader(file_byte)
        # `or ''` keeps a page with no extractable text from breaking the
        # concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)

    raise Exception('Invalid file type')

def load_btyes_io(files = None):
    """Load several in-memory / uploaded documents.

    Each item of ``files`` must expose a ``name`` attribute. Items whose
    name ends in ``txt`` or ``pdf`` are passed to ``load_io``; all others
    are skipped silently.

    Args:
        files: Iterable of file-like objects.

    Returns:
        A list of ``{'name': <item name>, 'content': <text>}`` dicts, one
        per loaded item, in input order.
    """
    accepted_suffixes = ('txt', 'pdf')

    loaded = []
    for upload in files:
        if upload.name[-3:] in accepted_suffixes:
            loaded.append({
                'name': upload.name,
                'content': load_io(upload),
            })
    return loaded