Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,293 Bytes
27e0148 ad4860f cd29aa5 c05248f 8222ac1 27e0148 a970ae0 27e0148 fbc19a1 8222ac1 e543e92 8222ac1 e543e92 c05248f 3f1630b 8222ac1 27e0148 8e28628 10c1114 32490fd 10c1114 27e0148 32490fd 27e0148 140f5d3 27e0148 f05ca9d 27e0148 639f8ea 27e0148 49e03fb 27e0148 c824141 27e0148 8e28628 3f1630b 8e28628 10c1114 32490fd 10c1114 457910c d34f941 c824141 ad4860f 32490fd ad4860f 27e0148 8f57a99 cd29aa5 f22f3ec cd29aa5 8f57a99 47f2421 8f57a99 a13202f 8f57a99 32490fd a127a18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import os
import duckdb
import gradio as gr
from httpx import Client
from huggingface_hub import HfApi
import pandas as pd
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import spaces
from llama_cpp import Llama
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
headers = {
"Accept" : "application/json",
"Content-Type": "application/json"
}
client = Client(headers=headers)
api = HfApi()
llama = Llama(
model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
n_ctx=2048,
n_gpu_layers=50
)
@spaces.GPU
def generate_sql(prompt):
# pred = pipe(prompt, max_length=1000)
# return pred[0]["generated_text"]
pred = llama(prompt, temperature=0.1, max_tokens=1000)
return pred["choices"][0]["text"]
def get_first_parquet(dataset: str):
resp = client.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
return resp.json()["parquet_files"][0]
def text2sql(dataset_name, query_input):
print(f"start text2sql for {dataset_name}")
try:
first_parquet = get_first_parquet(dataset_name)
except Exception as error:
return {
schema_output: "",
prompt_output: "",
query_output: "",
df:pd.DataFrame([{"error": f"β Could not get dataset schema. {error=}"}])
}
first_parquet_url = first_parquet["url"]
print(f"getting schema from {first_parquet_url}")
con = duckdb.connect()
con.execute("INSTALL 'httpfs'; LOAD httpfs;")
# could get from Parquet instead?
con.execute(f"CREATE TABLE data as SELECT * FROM '{first_parquet_url}' LIMIT 1;")
result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
ddl_create = result.iloc[0,0]
text = f"""### Instruction:
Your task is to generate valid duckdb SQL to answer the following question.
### Input:
Here is the database schema that the SQL query will run on:
{ddl_create}
### Question:
{query_input}
### Response (use duckdb shorthand if possible):
"""
try:
sql_output = generate_sql(text)
except Exception as error:
return {
schema_output: ddl_create,
prompt_output: text,
query_output: "",
df:pd.DataFrame([{"error": f"β Unable to get the SQL query based on the text. {error=}"}])
}
# Should be replaced by the prompt but not working
sql_output = sql_output.replace("FROM data", f"FROM '{first_parquet_url}'")
try:
query_result = con.sql(sql_output).df()
except Exception as error:
query_result = pd.DataFrame([{"error": f"β Could not execute SQL query {error=}"}])
finally:
con.close()
return {
schema_output: ddl_create,
prompt_output: text,
query_output:sql_output,
df:query_result
}
with gr.Blocks() as demo:
gr.Markdown("# π« Generate SQL queries based on a given text for your Hugging Face Dataset π«")
dataset_name = HuggingfaceHubSearch(
label="Hub Dataset ID",
placeholder="Search for dataset id on Huggingface",
search_type="dataset",
value="jamescalam/world-cities-geo",
)
# dataset_name = gr.Textbox("jamescalam/world-cities-geo", label="Dataset Name")
query_input = gr.Textbox("Cities from Albania country", label="Ask something about your data")
examples = [
["Cities from Albania country"],
["The continent with the most number of countries"],
["Cities that start with 'A'"],
["Cities by region"],
]
gr.Examples(examples=examples, inputs=[query_input],outputs=[])
btn = gr.Button("Generate SQL")
query_output = gr.Textbox(label="Output SQL", interactive= False)
df = gr.DataFrame(datatype="markdown")
with gr.Accordion("Open for prompt details", open=False):
#with gr.Column(scale=1, min_width=600):
schema_output = gr.Textbox(label="Parquet Schema as CREATE DDL", interactive= False)
prompt_output = gr.Textbox(label="Generated prompt", interactive= False)
btn.click(text2sql, inputs=[dataset_name, query_input], outputs=[schema_output, prompt_output, query_output,df])
demo.launch(debug=True)
|