Spaces:
No application file
No application file
# from itertools import count | |
import re | |
import numpy as np | |
import pandas as pd | |
import operator as op | |
from uuid import uuid4 | |
import pycountry | |
from pyserini.search import LuceneSearcher, FaissSearcher | |
from pyserini.search.hybrid import HybridSearcher | |
from encoder import SentenceTransformerEncoder | |
def load_data(path, langs):
    """Load a tab-separated corpus file and keep the rows in the given languages.

    If the file has no ``uuid`` column, one random UUID is generated per row
    and the file at ``path`` is rewritten in place (tab-separated, without the
    index) so that the identifiers persist across runs.

    Args:
        path: Location of the TSV file. It must contain a ``lang`` column.
        langs: Collection of ``lang`` values to keep.

    Returns:
        An independent DataFrame containing only the rows whose ``lang``
        value is in ``langs``.
    """
    df = pd.read_csv(path, sep='\t')
    if 'uuid' not in df.columns:
        # Store the ids as plain strings: that is what read_csv yields on
        # every later run, so the column has the same type whether or not
        # the ids were created during this call.
        df['uuid'] = [str(uuid4()) for _ in range(len(df))]
        df.to_csv(path, index=False, sep='\t')
    # Return a copy rather than a filtered slice so callers can add or
    # overwrite columns without chained-assignment warnings.
    return df[df['lang'].isin(langs)].copy()
def get_country(code: str) -> str:
    """Resolve a country code to a display name via pycountry's ``alpha_2`` lookup.

    Resolution order:
      1. A falsy ``code`` yields the empty string.
      2. If the whole code resolves, the country's name is returned.
      3. If the code contains ``-``, the part before the first dash is looked
         up and the remaining parts are appended in parentheses, re-joined
         with ``-``.
      4. Otherwise the code itself is returned unchanged.
    """
    if not code:
        return ""

    match = pycountry.countries.get(alpha_2=code)
    if match:
        return match.name

    if "-" in code:
        prefix, *suffix = code.split("-")
        match = pycountry.countries.get(alpha_2=prefix)
        if match:
            return f"{match.name} ({'-'.join(suffix)})"

    # Nothing resolved: hand the raw code back to the caller.
    return code
def load_corpus(corpus_path, langs, pattern):
    """Load the corpus and normalise it for searching and filtering.

    Steps, in order:
      * load the rows for ``langs`` through ``load_data``;
      * turn empty-string sentences into NaN and drop rows without a sentence;
      * drop rows with a repeated ``id``, then fully identical rows;
      * index the frame by the string form of ``uuid`` (the column is kept);
      * lowercase ``sentence`` and derive ``alpha_sentence`` by deleting every
        match of ``pattern`` from it;
      * parse ``countries`` from its bracketed, comma-separated text form into
        a list and map each entry through ``get_country``.

    Args:
        corpus_path: Path of the TSV corpus file.
        langs: Collection of ``lang`` values to keep.
        pattern: Regular expression (string or compiled) whose matches are
            removed from the sentence to build ``alpha_sentence``.

    Returns:
        The cleaned DataFrame, indexed by ``uuid``.
    """
    # Copy first so the column assignments below never write into a slice of
    # the frame that load_data filtered.
    corpus_df = load_data(corpus_path, langs).copy()
    corpus_df["uuid"] = corpus_df["uuid"].apply(str)
    corpus_df["sentence"] = corpus_df["sentence"].apply(
        lambda x: np.nan if x == "" else x
    )

    corpus_df = (
        corpus_df.dropna(subset=["sentence"])
        .drop_duplicates(subset="id")
        .drop_duplicates()
        .copy()
    )
    # uuid is already a string column at this point; no second conversion.
    corpus_df.index = corpus_df["uuid"]

    # Compile once instead of handing the raw pattern to re.sub for every row.
    regex = re.compile(pattern)
    corpus_df["sentence"] = corpus_df["sentence"].apply(lambda x: x.lower())
    corpus_df["alpha_sentence"] = corpus_df["sentence"].apply(
        lambda x: regex.sub('', x).lower()
    )

    # Quotes, spaces and square brackets are stripped in a single pass.
    noise = str.maketrans("", "", "' []")

    def parse_countries(raw):
        # A missing cell (NaN) has no string methods; treat it as "no
        # countries" instead of failing the whole load.
        if not isinstance(raw, str):
            return []
        return [get_country(c) for c in raw.translate(noise).split(",")]

    corpus_df["countries"] = corpus_df["countries"].apply(parse_countries)
    return corpus_df
def filter_years(corpus_df, year):
    """Keep the rows whose ``date`` lies within ``year[0]``..``year[1]``, inclusive.

    Args:
        corpus_df: DataFrame with a ``date`` column.
        year: Indexable pair ``(lower, upper)``; both bounds are included.

    Returns:
        The rows of ``corpus_df`` that fall inside the range.
    """
    in_range = corpus_df["date"].between(year[0], year[1])
    return corpus_df[in_range]
def filter_corpus(corpus_df, values, column, row_type=list):
    """Keep the rows whose ``column`` cell matches at least one of ``values``.

    Args:
        corpus_df: DataFrame to filter.
        values: Candidate values to look for.
        column: Name of the column to inspect.
        row_type: ``list`` when each cell is a list and a value must be one of
            its elements; ``str`` when each cell is a string and a value must
            occur in it as a substring. Any other value selects no rows.

    Returns:
        The matching rows in their original order. Each row appears at most
        once, even when it matches several of ``values``.
    """
    cells = corpus_df[column].tolist()
    if row_type is list or row_type is str:
        # `in` is element membership for list cells and substring search for
        # string cells. any() stops at the first hit, so a row that matches
        # several values is selected once rather than once per match.
        positions = [
            pos
            for pos, cell in enumerate(cells)
            if any(value in cell for value in values)
        ]
    else:
        positions = []
    # Select by position: mapping positions back through index labels would
    # multiply rows whenever the index contains repeated labels.
    return corpus_df.iloc[positions]
def load_searchers(index_name, model_name):
    """Build a hybrid searcher from a Lucene searcher and a Faiss searcher.

    Args:
        index_name: Passed to both ``LuceneSearcher`` and ``FaissSearcher``.
        model_name: Passed to ``SentenceTransformerEncoder`` to create the
            query encoder for the dense searcher.

    Returns:
        A ``HybridSearcher`` wrapping the two searchers.
    """
    # NOTE(review): the same index_name is handed to the sparse and the dense
    # searcher — confirm both index types are resolvable from that one name.
    sparse = LuceneSearcher(index_name)
    query_encoder = SentenceTransformerEncoder(model_name)
    dense = FaissSearcher(index_name, query_encoder)
    return HybridSearcher(sparse_searcher=sparse, dense_searcher=dense)