Spaces:

Temuzin64
/

Telugu_TextExtraction

Sleeping

App Files Files Community

Telugu_TextExtraction / src /streamlit_app.py

Temuzin64

Update src/streamlit_app.py

2d493f2 verified 7 months ago

raw

history blame contribute delete

2.57 kB

	import streamlit as st
	from PIL import Image, ImageFilter, ImageEnhance
	import tempfile
	import os
	import easyocr
	from transformers import MT5ForConditionalGeneration, MT5Tokenizer, pipeline

	# Load the mT5 tokenizer, model and generation pipeline once per server process.
	# Streamlit re-executes this script from the top on every widget interaction, so a
	# plain module-level load would re-read the model weights on each rerun;
	# st.cache_resource keeps a single shared instance alive instead.
	# show_spinner=False: the default spinner is a UI element, and nothing should be
	# drawn before the st.set_page_config call further down the script.
	# NOTE(review): "google/mt5-small" appears to be a pretrained-only checkpoint (no
	# instruction tuning) -- confirm it produces useful output for the correction
	# prompt, or swap in a fine-tuned checkpoint.
	@st.cache_resource(show_spinner=False)
	def _load_correction_pipeline():
	    """Return ``(tokenizer, model, pipe)`` for the mT5 text2text-generation pipeline."""
	    # legacy=False / use_fast=False are kept from the original to avoid tokenizer warnings.
	    mt5_tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small", legacy=False, use_fast=False)
	    mt5_model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
	    mt5_pipe = pipeline("text2text-generation", model=mt5_model, tokenizer=mt5_tokenizer)
	    return mt5_tokenizer, mt5_model, mt5_pipe


	# Module-level names are preserved for the rest of the script.
	tokenizer, model, pipe = _load_correction_pipeline()

	# Clean up an uploaded image so the OCR engine has an easier job.
	def preprocess_image_pillow(image):
	    """Return a grayscale, 2x-upscaled, contrast-boosted and sharpened copy of *image*.

	    Args:
	        image: A PIL image (anything offering ``convert``).

	    Returns:
	        A new PIL image in mode "L" with twice the width and height of the input.
	    """
	    gray = image.convert("L")  # single-channel luminance
	    doubled_size = tuple(2 * side for side in gray.size)  # (width * 2, height * 2)
	    upscaled = gray.resize(doubled_size, Image.LANCZOS)
	    boosted = ImageEnhance.Contrast(upscaled).enhance(2.0)  # factor 2.0 = doubled contrast
	    return boosted.filter(ImageFilter.SHARPEN)

	# Page chrome: browser-tab title/layout, on-page heading, then the upload widget.
	st.set_page_config(
	    page_title="📝 Telugu OCR & Correction",
	    layout="centered",
	)
	st.title("📝 Telugu Handwriting to Typed Text")

	# Only common raster formats are accepted by the uploader.
	accepted_extensions = ["png", "jpg", "jpeg"]
	uploaded_file = st.file_uploader(
	    "📤 Upload Telugu handwritten image",
	    type=accepted_extensions,
	)

	@st.cache_resource
	def _get_ocr_reader():
	    """Build the Telugu EasyOCR reader once and reuse it across script reruns."""
	    return easyocr.Reader(['te'], gpu=False)


	def _run_ocr(pil_image):
	    """Run EasyOCR on *pil_image* and return the recognised lines joined by newlines.

	    The image is handed to EasyOCR through a temporary PNG file, which is always
	    removed afterwards, even when OCR raises.
	    """
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp:
	        # Write through the already-open handle instead of re-opening the path by name.
	        pil_image.save(temp, format="PNG")
	    try:
	        results = _get_ocr_reader().readtext(temp.name)
	    finally:
	        # Always remove the temp file; tolerate it having vanished already.
	        try:
	            os.remove(temp.name)
	        except FileNotFoundError:
	            pass
	    # Each result is unpacked as a 3-tuple; only the middle (text) element is kept.
	    return "\n".join(text for (_, text, _) in results)


	def _process_upload(upload):
	    """Preprocess the uploaded image, show the OCR text, then show the mT5 correction."""
	    try:
	        image = Image.open(upload).convert("RGB")
	    except OSError as e:
	        # Covers files PIL cannot identify or decode (corrupt or mislabelled uploads).
	        st.error(f"Could not read the uploaded image: {e}")
	        return

	    enhanced_image = preprocess_image_pillow(image)
	    st.image(enhanced_image, caption="Preprocessed Image", use_container_width=True)

	    raw_text = _run_ocr(enhanced_image)

	    st.markdown("### 📄 OCR Extracted Text")
	    st.text_area("📝 Telugu OCR", raw_text, height=150)

	    if not raw_text.strip():
	        st.warning("OCR did not extract any usable Telugu text.")
	        return

	    # Generate correction using mT5
	    st.markdown("### ✅ LLM Corrected Telugu Text")
	    prompt = f"Correct the following Telugu text spelling and grammar:\n{raw_text}"
	    try:
	        # Only the model call is guarded, so widget errors are not mislabelled as LLM errors.
	        response = pipe(prompt, max_new_tokens=256, do_sample=False)[0]['generated_text']
	    except Exception as e:
	        st.error(f"LLM Correction Error: {e}")
	        return

	    st.text_area("🤖 Corrected Text", response, height=150)
	    st.download_button("⬇️ Download", response, file_name="corrected_telugu.txt")


	if uploaded_file:
	    _process_upload(uploaded_file)