Spaces:
Runtime error
Runtime error
kernelmachine
committed on
Commit
•
842e849
1
Parent(s):
893dac4
added model
Browse files- app.py +69 -4
- model/archive/classes.npy +0 -0
- model/archive/coef.npy +0 -0
- model/archive/intercept.npy +0 -0
- model/best_hyperparameters.json +1 -0
- model/results.jsonl +1 -0
- score.py +23 -0
app.py
CHANGED
@@ -1,7 +1,72 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
import pathlib
|
8 |
+
import random
|
9 |
+
import shutil
|
10 |
+
import time
|
11 |
+
from typing import Any, Dict, List, Union
|
12 |
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
|
16 |
+
TfidfVectorizer)
|
17 |
+
from sklearn.linear_model import LogisticRegression
|
18 |
+
from sklearn.metrics import f1_score
|
19 |
+
from sklearn.model_selection import train_test_split
|
20 |
+
from tqdm import tqdm
|
21 |
+
from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch
|
22 |
+
from shutil import rmtree
|
23 |
|
24 |
+
|
25 |
+
def load_model(serialization_dir):
    """Rebuild the classifier and text vectorizer saved under ``serialization_dir``.

    Reads ``best_hyperparameters.json`` to choose the vectorizer type and the
    ``LogisticRegression`` settings, then attaches the arrays stored in the
    ``archive`` sub-directory (``coef.npy``, ``intercept.npy``,
    ``classes.npy`` and, when the file exists, ``idf.npy``).

    Args:
        serialization_dir: Directory containing ``best_hyperparameters.json``,
            the ``archive`` folder and, unless the ``weight`` hyperparameter
            is ``"hash"``, a ``vocab.json`` file.

    Returns:
        A ``(classifier, vect)`` tuple: the ``LogisticRegression`` instance
        with its coefficient arrays assigned, and the configured vectorizer.

    Raises:
        KeyError: If ``stopwords``, ``weight``, ``ngram_range``, ``C`` or
            ``tol`` is missing from the hyperparameter file.
        OSError: If a required JSON or ``.npy`` file cannot be opened.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"),
              'r', encoding='utf-8') as f:
        hyperparameters = json.load(f)

    # Vectorizer-related keys are popped so that whatever is left in the dict
    # can be passed directly to LogisticRegression further down.
    stop_words = 'english' if hyperparameters.pop('stopwords') == 1 else None
    weight = hyperparameters.pop('weight')
    binary = weight == 'binary'

    # ngram_range is stored as a whitespace-separated string such as "1 2".
    # scikit-learn documents this parameter as a (min_n, max_n) tuple, so a
    # tuple is built here instead of the list that sorted() would return.
    ngram_range = tuple(
        sorted(int(x) for x in hyperparameters.pop('ngram_range').split())
    )

    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)

    # Only the non-hashing vectorizers get a stored vocabulary attached.
    if weight != "hash":
        with open(os.path.join(serialization_dir, "vocab.json"),
                  'r', encoding='utf-8') as f:
            vocab = json.load(f)
        vect.vocabulary_ = vocab

    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)

    archive_dir = os.path.join(serialization_dir, "archive")

    # idf.npy is optional: it is loaded only when the file is present.
    idf_path = os.path.join(archive_dir, "idf.npy")
    if os.path.exists(idf_path):
        vect.idf_ = np.load(idf_path)

    # Assign the stored arrays directly instead of calling fit().
    classifier.coef_ = np.load(os.path.join(archive_dir, "coef.npy"))
    classifier.intercept_ = np.load(os.path.join(archive_dir, "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(archive_dir, "classes.npy"))
    return classifier, vect
|
63 |
+
|
64 |
+
def score(x, clf, vectorizer):
    """Return ``clf.predict_proba`` output for the single document ``x``.

    The document is wrapped in a one-element list before being passed to
    ``vectorizer.transform``; the transformed features are then handed to
    ``clf.predict_proba`` and its result is returned unchanged.
    """
    features = vectorizer.transform([x])
    return clf.predict_proba(features)
|
67 |
+
|
68 |
+
# Load the classifier and vectorizer from the "model/" directory once, at
# import time; start() below reads these module-level names on every call.
clf, vectorizer = load_model("model/")
|
69 |
+
|
70 |
+
def start(text):
    """Score ``text`` and return the result as a single-entry dict.

    Calls ``score`` with the module-level ``clf`` and ``vectorizer``, takes
    the second column of the first row of the returned probabilities, rounds
    it to two decimals and stores it under the key
    ``"GPT-3 Filter Quality Score"``.
    """
    probabilities = score(text, clf, vectorizer)
    # Row 0 is the only document scored; index 1 selects its second column.
    quality = round(probabilities[0][1], 2)
    return {"GPT-3 Filter Quality Score": quality}
|
model/archive/classes.npy
ADDED
Binary file (144 Bytes). View file
|
|
model/archive/coef.npy
ADDED
Binary file (8.39 MB). View file
|
|
model/archive/intercept.npy
ADDED
Binary file (136 Bytes). View file
|
|
model/best_hyperparameters.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"C": 0.977778, "multi_class": "auto", "ngram_range": "1 2", "penalty": "l1", "random_state": 44555, "solver": "liblinear", "stopwords": null, "tol": 0.000816, "weight": "hash"}
|
model/results.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"C":0.977778,"dev_accuracy":0.9332787322,"dev_f1":0.9226169818,"multi_class":"auto","ngram_range":"[1, 2]","penalty":"l1","random_state":44555,"solver":"liblinear","stopwords":null,"tol":0.000816,"training_duration":807.7028501034,"weight":"hash"}
|
score.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tqdm.auto import tqdm
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
def score_text(df, clf, clf_vectorizer, field='text'):
    """Score every row of ``df[field]`` with the quality filter.

    Adds ``prob_low_quality`` and ``prob_high_quality`` columns taken from
    the first and second entries of each ``clf.predict_proba`` row, plus a
    boolean ``GPT3_included`` column that is True when a fresh
    ``np.random.pareto(9)`` draw exceeds ``1 - prob_high_quality``.

    NOTE: the intermediate ``filter_output`` column and the two probability
    columns are written onto the frame that was passed in; the returned
    frame is the result of ``drop`` with ``GPT3_included`` added to it.

    Args:
        df: Frame holding the documents to score.
        clf: Object exposing ``predict_proba``.
        clf_vectorizer: Object exposing ``transform``.
        field: Name of the column containing the text.

    Returns:
        The frame without ``filter_output`` and with the three new columns.
    """
    def first_entry(pair):
        return pair[0]

    def second_entry(pair):
        return pair[1]

    def sample_inclusion(prob_high):
        # One Pareto draw per row, compared against the low-quality mass.
        return np.random.pareto(9) > (1 - prob_high)

    features = clf_vectorizer.transform(tqdm(df[field]))
    df['filter_output'] = clf.predict_proba(features).tolist()
    df['prob_low_quality'] = df['filter_output'].apply(first_entry)
    df['prob_high_quality'] = df['filter_output'].apply(second_entry)
    df = df.drop(columns=['filter_output'])
    df['GPT3_included'] = df['prob_high_quality'].apply(sample_inclusion)

    return df
|
14 |
+
|
15 |
+
def get_counts(df, field='text'):
    """Add a ``num_tokens`` column with the whitespace-token count of ``df[field]``.

    ``tqdm.pandas()`` is called first so that ``progress_apply`` is available
    on the column; each value is split with ``str.split()`` and the number of
    pieces is stored. The frame is modified in place and also returned.
    """
    def whitespace_token_count(text):
        return len(text.split())

    tqdm.pandas()
    token_counts = df[field].progress_apply(whitespace_token_count)
    df['num_tokens'] = token_counts
    return df
|
20 |
+
|
21 |
+
def score(x, clf, vectorizer):
    """Score one document and return the raw ``predict_proba`` result.

    ``x`` is placed in a single-item list, transformed by ``vectorizer`` and
    the resulting features are passed to ``clf.predict_proba``.
    """
    single_doc_batch = [x]
    features = vectorizer.transform(single_doc_batch)
    return clf.predict_proba(features)
|