HTML-to-Markdown

Running on Zero

App Files Files Community

HTML-to-Markdown / app.py

Nymbo

Update app.py

a3d71ce verified about 2 months ago

raw

history blame contribute delete

3.05 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import spaces
	import re
	from markdownify import markdownify


	# Reader LM checkpoints, keyed by Hugging Face model id. Each one is loaded
	# once at import time, switched to eval mode and moved to the "cuda" device.
	models = {
	    checkpoint: AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).eval().to("cuda")
	    for checkpoint in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
	}

	# Tokenizers for the same checkpoints, keyed by the same model ids so that a
	# single id selects both the model and its tokenizer.
	tokenizers = {
	    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
	    for checkpoint in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
	}


	# Matches the assistant turn in the decoded chat transcript. The pipes are
	# escaped with a single backslash so they are literal "|" characters rather
	# than regex alternation. The turn ends at "<|im_end|>" or, when that marker
	# is absent, at the end of the text.
	_ASSISTANT_TURN = re.compile(r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", re.DOTALL)


	def _extract_assistant_response(decoded_text):
	    """Return the text of the first assistant turn in *decoded_text*, or "" if there is none."""
	    found = _ASSISTANT_TURN.search(decoded_text)
	    return found.group(1) if found else ""


	@spaces.GPU
	def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
	    """Convert HTML to Markdown with a Reader LM model and with markdownify.

	    Args:
	        html_content: HTML source to convert.
	        model_id: Key into the module-level ``models`` and ``tokenizers``
	            dicts selecting which checkpoint to run.

	    Returns:
	        A ``(reader_lm_output, markdownify_output)`` tuple of strings. The
	        first element is the assistant turn extracted from the decoded
	        generation ("" when no assistant turn is found); the second is
	        ``markdownify(html_content)``.

	    Raises:
	        KeyError: If ``model_id`` is not a key of ``models``/``tokenizers``.
	    """
	    print("Start Model Processing")
	    model = models[model_id]
	    tokenizer = tokenizers[model_id]

	    # Wrap the raw HTML as a single user message and render it with the
	    # tokenizer's chat template before encoding.
	    messages = [{"role": "user", "content": html_content}]
	    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
	    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

	    # Generation is capped at 1024 new tokens, so the decoded text may end
	    # without a closing "<|im_end|>"; the extraction helper tolerates that
	    # instead of raising IndexError on an empty match list.
	    outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
	    assistant_response = _extract_assistant_response(tokenizer.decode(outputs[0]))

	    print("Start Markdownify Processing")
	    markdownify_output = markdownify(html_content)
	    return assistant_response, markdownify_output


	# Custom stylesheet passed to gr.Blocks(css=css) below: gives the element with
	# id "output" a fixed 500px height, scrollable overflow and a 1px solid border.
	# NOTE(review): no component visible in this file sets elem_id="output", so
	# this rule may currently match nothing — confirm.
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	# Sample to-do-list HTML used as the single example input of gr.Examples below.
	example_html = """<div id="myDIV" class="header">
	<h2>My To Do List</h2>
	<input type="text" id="myInput" placeholder="Title...">
	<span onclick="newElement()" class="addBtn">Add</span>
	</div>

	<ul id="myUL">
	<li>Hit the gym</li>
	<li class="checked">Pay bills</li>
	<li>Meet George</li>
	<li>Buy eggs</li>
	<li>Read a book</li>
	<li>Organize office</li>
	</ul>"""

	# Gradio UI: a two-column layout with the inputs (model choice, HTML box,
	# submit button) on the left and the two converter outputs on the right.
	# NOTE(review): the nesting below is the conventional layout; gr.Examples and
	# the click handler are placed at Blocks level beneath the row — confirm.
	with gr.Blocks(css=css, theme="Nymbo/Nymbo_Theme") as demo:
	    gr.Markdown("""
	# HTML-to-Markdown
	Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
	""")
	    with gr.Row():
	        # Input side.
	        with gr.Column():
	            model_selector = gr.Dropdown(
	                label="Model",
	                choices=list(models),
	                value="jinaai/reader-lm-1.5b",
	            )
	            html_content = gr.Textbox(label="HTML")
	            submit_btn = gr.Button(value="Submit")
	        # Output side: one box per conversion approach.
	        with gr.Column():
	            model_output_text = gr.Textbox(label="Reader LM Output")
	            markdownify_output = gr.Textbox(label="Markdownify Output")

	    # Examples only supply the HTML input, so run_example falls back to its
	    # default model_id when they are executed.
	    gr.Examples(
	        label="Try examples",
	        examples=[[example_html]],
	        fn=run_example,
	        inputs=[html_content],
	        outputs=[model_output_text, markdownify_output],
	        cache_examples=True,
	    )

	    # The submit button passes both the HTML and the selected model id.
	    submit_btn.click(
	        fn=run_example,
	        inputs=[html_content, model_selector],
	        outputs=[model_output_text, markdownify_output],
	    )

	demo.launch(debug=True)