Spaces:

anton-l
/

html-viz

Runtime error

App Files Files Community

anton-l HF staff committed on Mar 8, 2023

Commit

b2b504b

•

1 Parent(s): db7c020

init

Browse files

Files changed (2) hide show

app.py +55 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import re
+import gradio as gr
+import requests
+from inscriptis import get_text
+from inscriptis.css_profiles import CSS_PROFILES
+from inscriptis.model.config import ParserConfig
+from readability import Document
+# Shared inscriptis parser configuration built from the library's "strict" CSS
+# profile; passed to get_text() in extract_text below.
+INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
+def extract_text(url: str):
+    html = requests.get(url).content.decode("utf-8")
+    if len(html.strip()) == 0:
+        return "", "", "", ""
+    parsed_doc = Document(html)
+    # get the body of the article with readability-lxml
+    title = parsed_doc.short_title()
+    clean_html = parsed_doc.summary(html_partial=True)
+    del parsed_doc
+    # get the formatted plaintext with inscriptis
+    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
+    if not re.search(r"\w+", text):
+        # no words found, only whitespace and punctuation
+        return title, "", clean_html, html
+    # remove excessive empty lines
+    text = re.sub(r"\n\s*\n", "\n\n", text)
+    return title, text, clean_html, html
+title = gr.Textbox(label="Title")
+text = gr.Textbox(label="Text", lines=10)
+clean_html = gr.Textbox(label="Clean HTML", lines=10)
+html = gr.Textbox(label="Raw HTML", lines=10)
+demo = gr.Interface(
+    extract_text,
+    gr.Textbox(placeholder="https://hf.co/", label="URL"),
+    [title, text, clean_html, html],
+    examples=[
+        ["https://huggingface.co/blog/peft"],
+        [
+            "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
+        ],
+    ],
+)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ git+https://github.com/huggingface/python-readability@speedup
2	+ inscriptis==2.3.2