UnMelow committed
Commit d77255d · verified · 1 Parent(s): 26508ea

Update app.py

Files changed (1)
  1. app.py +363 -488
app.py CHANGED
@@ -1,12 +1,8 @@
  import os
- import sys
  import re
- import shutil
  import tempfile
- import warnings
- import base64
- from io import StringIO, BytesIO
- from typing import List, Tuple
+ from io import BytesIO
+ from typing import List, Tuple, Optional
 
  import gradio as gr
  import torch
@@ -15,185 +11,61 @@ from PIL import Image, ImageDraw, ImageFont, ImageOps
  import fitz  # PyMuPDF
 
  from transformers import (
-     AutoModel,
-     AutoTokenizer,
      AutoProcessor,
      VisionEncoderDecoderModel,
      BlipProcessor,
      BlipForConditionalGeneration,
  )
 
- # --- Optional HF Spaces GPU decorator (safe fallback for local runs) ---
- try:
-     import spaces  # type: ignore
-
-     gpu_decorator = spaces.GPU
- except Exception:
-     def gpu_decorator(*args, **kwargs):
-         def wrap(fn):
-             return fn
-         return wrap
-
-
- # =========================
- # Device / dtype utilities
- # =========================
- def get_device() -> str:
-     return "cuda" if torch.cuda.is_available() else "cpu"
-
-
- def get_cuda_dtype() -> torch.dtype:
-     # bf16 only on supported GPUs (Ampere+). Otherwise fp16.
-     try:
-         if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
-             return torch.bfloat16
-     except Exception:
-         pass
-     return torch.float16
-
-
- DEVICE = get_device()
- CUDA_DTYPE = get_cuda_dtype() if DEVICE == "cuda" else torch.float32
-
-
- # =========================
- # Model names
- # =========================
- DEEPSEEK_OCR_NAME = os.getenv("DEEPSEEK_OCR_MODEL", "deepseek-ai/DeepSeek-OCR")
- # Optional pin to a specific revision/commit to avoid auto-updating remote code.
- DEEPSEEK_OCR_REVISION = os.getenv("DEEPSEEK_OCR_REVISION", None)
-
- TROCR_NAME = os.getenv("TROCR_MODEL", "microsoft/trocr-base-printed")
- BLIP_NAME = os.getenv("BLIP_MODEL", "Salesforce/blip-image-captioning-base")
-
-
- # =========================
- # Load DeepSeek-OCR safely
- # =========================
- def load_deepseek_ocr():
-     tokenizer = AutoTokenizer.from_pretrained(
-         DEEPSEEK_OCR_NAME,
-         trust_remote_code=True,
-         revision=DEEPSEEK_OCR_REVISION,
-     )
-
-     base_kwargs = dict(
-         trust_remote_code=True,
-         use_safetensors=True,
-         revision=DEEPSEEK_OCR_REVISION,
-     )
-
-     # IMPORTANT:
-     # - Do NOT force flash_attention_2 on CPU.
-     # - On CUDA: try flash_attention_2, but gracefully fall back if unavailable.
-     if DEVICE == "cuda":
-         # Try FlashAttention2 first
-         try:
-             model = AutoModel.from_pretrained(
-                 DEEPSEEK_OCR_NAME,
-                 torch_dtype=CUDA_DTYPE,
-                 _attn_implementation="flash_attention_2",
-                 **base_kwargs,
-             )
-         except Exception as e:
-             warnings.warn(
-                 f"FlashAttention2 unavailable or failed ({e}). Falling back to SDPA/eager."
-             )
-             # Try SDPA
-             try:
-                 model = AutoModel.from_pretrained(
-                     DEEPSEEK_OCR_NAME,
-                     torch_dtype=CUDA_DTYPE,
-                     _attn_implementation="sdpa",
-                     **base_kwargs,
-                 )
-             except Exception:
-                 # Final fallback
-                 model = AutoModel.from_pretrained(
-                     DEEPSEEK_OCR_NAME,
-                     torch_dtype=CUDA_DTYPE,
-                     _attn_implementation="eager",
-                     **base_kwargs,
-                 )
-
-         model = model.eval().to(DEVICE)
-
-     else:
-         # CPU path: no flash attention, use float32 for stability
-         model = AutoModel.from_pretrained(
-             DEEPSEEK_OCR_NAME,
-             torch_dtype=torch.float32,
-             _attn_implementation="eager",
-             **base_kwargs,
-         )
-         model = model.eval().to(DEVICE)
-
-     return tokenizer, model
-
-
- tokenizer, deepseek_model = load_deepseek_ocr()
-
-
- # =========================
- # Load TrOCR and BLIP
- # =========================
- def load_trocr():
-     processor = AutoProcessor.from_pretrained(TROCR_NAME)
-     model = VisionEncoderDecoderModel.from_pretrained(TROCR_NAME).eval()
-     if DEVICE == "cuda":
-         model = model.to(DEVICE).to(dtype=CUDA_DTYPE)
-     else:
-         model = model.to(DEVICE)
-     return processor, model
-
-
- def load_blip():
-     processor = BlipProcessor.from_pretrained(BLIP_NAME)
-     model = BlipForConditionalGeneration.from_pretrained(BLIP_NAME).eval()
-     if DEVICE == "cuda":
-         model = model.to(DEVICE).to(dtype=CUDA_DTYPE)
-     else:
-         model = model.to(DEVICE)
-     return processor, model
-
-
- trocr_processor, trocr_model = load_trocr()
- blip_processor, blip_model = load_blip()
-
-
- # =========================
- # App configs
- # =========================
- MODEL_CONFIGS = {
-     "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-     "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-     "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-     "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-     "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
- }
-
- TASK_PROMPTS = {
-     "📋 Markdown": {
-         "prompt": "<image>\n<|grounding|>Convert the document to markdown.",
-         "has_grounding": True,
-     },
-     # NOTE: Free OCR is now handled by TrOCR (fast, text-only)
-     "📝 Free OCR": {"prompt": "", "has_grounding": False},
-     # Locate stays on DeepSeek (grounding)
-     "📍 Locate": {
-         "prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.",
-         "has_grounding": True,
-     },
-     # Describe is now handled by BLIP
-     "🔍 Describe": {"prompt": "", "has_grounding": False},
-     "✏️ Custom": {"prompt": "", "has_grounding": False},
- }
-
-
- # =========================
+ # -------------------------
+ # CPU-only setup
+ # -------------------------
+ DEVICE = torch.device("cpu")
+ torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "4")))
+
+ TROCR_NAME = os.getenv("TROCR_MODEL", "microsoft/trocr-base-printed")
+ BLIP_NAME = os.getenv("BLIP_MODEL", "Salesforce/blip-image-captioning-base")
+
+ # -------------------------
+ # Models (CPU)
+ # -------------------------
+ trocr_processor = AutoProcessor.from_pretrained(TROCR_NAME)
+ trocr_model = VisionEncoderDecoderModel.from_pretrained(TROCR_NAME).eval().to(DEVICE)
+
+ blip_processor = BlipProcessor.from_pretrained(BLIP_NAME)
+ blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_NAME).eval().to(DEVICE)
+
+ # -------------------------
+ # Optional: pytesseract (for boxes on images)
+ # -------------------------
+ def _try_import_tesseract():
+     try:
+         import pytesseract  # type: ignore
+         # Quick sanity check: version call triggers binary lookup
+         _ = pytesseract.get_tesseract_version()
+         return pytesseract
+     except Exception:
+         return None
+
+ PYTESS = _try_import_tesseract()
+
+ # -------------------------
+ # UI / tasks
+ # -------------------------
+ TASKS = [
+     "OCR",
+     "Markdown",
+     "Locate",
+     "Describe",
+ ]
+
+ DEFAULT_DPI = 200  # PDF render DPI
+
+
+ # -------------------------
  # Helpers
- # =========================
- def safe_load_font(size: int = 30) -> ImageFont.FreeTypeFont:
+ # -------------------------
+ def _safe_font(size: int = 28):
      candidates = [
          "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
          "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
@@ -203,373 +75,376 @@ def safe_load_font(size: int = 30) -> ImageFont.FreeTypeFont:
          if os.path.exists(p):
              return ImageFont.truetype(p, size)
      except Exception:
-         continue
+         pass
      return ImageFont.load_default()
 
 
- def extract_grounding_references(text: str):
-     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
-     return re.findall(pattern, text, re.DOTALL)
-
-
- def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
-     img_w, img_h = image.size
-     img_draw = image.copy()
-     draw = ImageDraw.Draw(img_draw)
-     overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
-     draw2 = ImageDraw.Draw(overlay)
-     font = safe_load_font(30)
-     crops = []
-
-     color_map = {}
-     np.random.seed(42)
-
-     for ref in refs:
-         label = ref[1]
-         if label not in color_map:
-             color_map[label] = (
-                 int(np.random.randint(50, 255)),
-                 int(np.random.randint(50, 255)),
-                 int(np.random.randint(50, 255)),
-             )
-
-         color = color_map[label]
-         try:
-             coords = eval(ref[2])
-         except Exception:
-             continue
-
-         color_a = color + (60,)
-
-         for box in coords:
-             x1, y1, x2, y2 = (
-                 int(box[0] / 999 * img_w),
-                 int(box[1] / 999 * img_h),
-                 int(box[2] / 999 * img_w),
-                 int(box[3] / 999 * img_h),
-             )
-
-             if extract_images and label == "image":
-                 crops.append(image.crop((x1, y1, x2, y2)))
-
-             width = 5 if label == "title" else 3
-             draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
-             draw2.rectangle([x1, y1, x2, y2], fill=color_a)
-
-             text_bbox = draw.textbbox((0, 0), label, font=font)
-             tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
-             ty = max(0, y1 - 20)
-             draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
-             draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
-
-     img_draw.paste(overlay, (0, 0), overlay)
-     return img_draw, crops
-
-
- def clean_output(text: str, include_images: bool = False) -> str:
-     if not text:
-         return ""
-     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
-     matches = re.findall(pattern, text, re.DOTALL)
-     img_num = 0
-
-     for match in matches:
-         if "<|ref|>image<|/ref|>" in match[0]:
-             if include_images:
-                 text = text.replace(match[0], f"\n\n**[Figure {img_num + 1}]**\n\n", 1)
-                 img_num += 1
-             else:
-                 text = text.replace(match[0], "", 1)
-         else:
-             text = re.sub(rf"(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?", "", text)
-
-     return text.strip()
-
-
- def embed_images(markdown: str, crops: List[Image.Image]) -> str:
-     if not crops:
-         return markdown
-     for i, img in enumerate(crops):
-         buf = BytesIO()
-         img.save(buf, format="PNG")
-         b64 = base64.b64encode(buf.getvalue()).decode()
-         markdown = markdown.replace(
-             f"**[Figure {i + 1}]**",
-             f"\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n",
-             1,
-         )
-     return markdown
-
-
- def trocr_ocr(image: Image.Image) -> str:
-     if image.mode != "RGB":
-         image = image.convert("RGB")
-     pixel_values = trocr_processor(images=image, return_tensors="pt").pixel_values.to(DEVICE)
-     with torch.no_grad():
-         # Keep generation modest (faster)
-         generated_ids = trocr_model.generate(pixel_values, max_new_tokens=256)
-     text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-     return text.strip()
-
-
- def blip_describe(image: Image.Image) -> str:
-     if image.mode != "RGB":
-         image = image.convert("RGB")
-     inputs = blip_processor(images=image, return_tensors="pt").to(DEVICE)
-     with torch.no_grad():
-         out = blip_model.generate(**inputs, max_new_tokens=80)
-     caption = blip_processor.decode(out[0], skip_special_tokens=True)
-     return caption.strip()
-
-
- # =========================
- # Core processing
- # =========================
- @gpu_decorator(duration=60)
- def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
-     if image is None:
-         return "Error: upload image", "", "", None, []
-
-     if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
-         return "Error: enter prompt", "", "", None, []
-
-     if image.mode in ("RGBA", "LA", "P"):
-         image = image.convert("RGB")
-     image = ImageOps.exif_transpose(image)
-
-     # --- Route tasks to the best backend ---
-     if task == "📝 Free OCR":
-         text = trocr_ocr(image)
-         if not text:
-             return "No text", "", "", None, []
-         md = "```text\n" + text + "\n```"
-         return text, md, text, None, []
-
-     if task == "🔍 Describe":
-         desc = blip_describe(image)
-         if not desc:
-             return "No description", "", "", None, []
-         md = f"**Description:** {desc}"
-         return desc, md, desc, None, []
-
-     # --- DeepSeek-OCR for Markdown / Locate / Custom ---
-     config = MODEL_CONFIGS[mode]
-
-     if task == "✏️ Custom":
-         prompt = f"<image>\n{custom_prompt.strip()}"
-         has_grounding = "<|grounding|>" in custom_prompt
-     elif task == "📍 Locate":
-         prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
-         has_grounding = True
-     else:
-         prompt = TASK_PROMPTS[task]["prompt"]
-         has_grounding = TASK_PROMPTS[task]["has_grounding"]
-
-     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
-     image.save(tmp.name, "JPEG", quality=95)
-     tmp.close()
-     out_dir = tempfile.mkdtemp()
-
-     stdout = sys.stdout
-     sys.stdout = StringIO()
-
-     try:
-         deepseek_model.infer(
-             tokenizer=tokenizer,
-             prompt=prompt,
-             image_file=tmp.name,
-             output_path=out_dir,
-             base_size=config["base_size"],
-             image_size=config["image_size"],
-             crop_mode=config["crop_mode"],
-         )
-
-         result = "\n".join(
-             [
-                 l
-                 for l in sys.stdout.getvalue().split("\n")
-                 if not any(
-                     s in l
-                     for s in [
-                         "image:",
-                         "other:",
-                         "PATCHES",
-                         "====",
-                         "BASE:",
-                         "%|",
-                         "torch.Size",
-                     ]
-                 )
-             ]
-         ).strip()
-
-     finally:
-         sys.stdout = stdout
-     try:
-         os.unlink(tmp.name)
-     except Exception:
-         pass
-     shutil.rmtree(out_dir, ignore_errors=True)
-
-     if not result:
-         return "No text", "", "", None, []
-
-     cleaned = clean_output(result, include_images=False)
-     markdown = clean_output(result, include_images=True)
-
-     img_out = None
-     crops = []
-
-     if has_grounding and "<|ref|>" in result:
-         refs = extract_grounding_references(result)
-         if refs:
-             img_out, crops = draw_bounding_boxes(image, refs, extract_images=True)
-
-     markdown = embed_images(markdown, crops)
-
-     return cleaned, markdown, result, img_out, crops
-
-
- @gpu_decorator(duration=60)
- def process_pdf(path: str, mode: str, task: str, custom_prompt: str, page_num: int):
-     doc = fitz.open(path)
-     total_pages = len(doc)
-     if page_num < 1 or page_num > total_pages:
-         doc.close()
-         return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
-     page = doc.load_page(page_num - 1)
-     pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
-     img = Image.open(BytesIO(pix.tobytes("png")))
-     doc.close()
-     return process_image(img, mode, task, custom_prompt)
-
-
- def process_file(path: str, mode: str, task: str, custom_prompt: str, page_num: int):
-     if not path:
-         return "Error: upload file", "", "", None, []
-     if path.lower().endswith(".pdf"):
-         return process_pdf(path, mode, task, custom_prompt, page_num)
-     return process_image(Image.open(path), mode, task, custom_prompt)
-
-
- def toggle_prompt(task: str):
-     if task == "✏️ Custom":
-         return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
-     if task == "📍 Locate":
-         return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
-     return gr.update(visible=False)
-
-
- def select_boxes(task: str):
-     if task == "📍 Locate":
-         return gr.update(selected="tab_boxes")
-     return gr.update()
-
-
- def get_pdf_page_count(file_path: str) -> int:
-     if not file_path or not file_path.lower().endswith(".pdf"):
-         return 1
-     doc = fitz.open(file_path)
-     count = len(doc)
-     doc.close()
-     return count
-
-
- def load_image(file_path: str, page_num: int = 1):
-     if not file_path:
-         return None
-     if file_path.lower().endswith(".pdf"):
-         doc = fitz.open(file_path)
-         page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
-         page = doc.load_page(page_idx)
-         pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
-         img = Image.open(BytesIO(pix.tobytes("png")))
-         doc.close()
-         return img
-     return Image.open(file_path)
-
-
- def update_page_selector(file_path: str):
-     if not file_path:
-         return gr.update(visible=False)
-     if file_path.lower().endswith(".pdf"):
-         page_count = get_pdf_page_count(file_path)
-         return gr.update(
-             visible=True,
-             maximum=page_count,
-             value=1,
-             minimum=1,
-             label=f"Select Page (1-{page_count})",
-         )
-     return gr.update(visible=False)
-
-
- # =========================
- # UI
- # =========================
- with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR + TrOCR + BLIP") as demo:
-     gr.Markdown(
-         f"""
- # DeepSeek-OCR Demo (with TrOCR + BLIP)
-
- This app supports:
- - **Markdown**: DeepSeek-OCR (structured markdown + optional grounding boxes)
- - **Free OCR**: TrOCR (fast text-only OCR)
- - **Locate**: DeepSeek-OCR (grounding boxes)
- - **Describe**: BLIP (image captioning)
-
- Runtime device: **{DEVICE}**
- """
-     )
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
-             input_img = gr.Image(label="Input Image", type="pil", height=300)
-             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
-
-             mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
-             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
-             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
-
-             btn = gr.Button("Extract", variant="primary", size="lg")
-
-         with gr.Column(scale=2):
-             with gr.Tabs() as tabs:
-                 with gr.Tab("Text", id="tab_text"):
-                     text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
-                 with gr.Tab("Markdown Preview", id="tab_markdown"):
-                     md_out = gr.Markdown("")
-                 with gr.Tab("Boxes", id="tab_boxes"):
-                     img_out = gr.Image(type="pil", height=500, show_label=False)
-                 with gr.Tab("Cropped Images", id="tab_crops"):
-                     gallery = gr.Gallery(show_label=False, columns=3, height=400)
-                 with gr.Tab("Raw Text", id="tab_raw"):
-                     raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
-
-     # File / PDF page handling
-     file_in.change(load_image, [file_in, page_selector], [input_img])
-     file_in.change(update_page_selector, [file_in], [page_selector])
-     page_selector.change(load_image, [file_in, page_selector], [input_img])
-
-     # Prompt visibility and tab switch
-     task.change(toggle_prompt, [task], [prompt])
-     task.change(select_boxes, [task], [tabs])
-
-     def run(image, file_path, mode, task, custom_prompt, page_num):
-         if file_path:
-             return process_file(file_path, mode, task, custom_prompt, int(page_num))
-         if image is not None:
-             return process_image(image, mode, task, custom_prompt)
-         return "Error: upload file or image", "", "", None, []
-
-     submit_event = btn.click(
-         run,
-         [input_img, file_in, mode, task, prompt, page_selector],
-         [text_out, md_out, raw_out, img_out, gallery],
-     )
-     submit_event.then(select_boxes, [task], [tabs])
+ def _to_rgb(img: Image.Image) -> Image.Image:
+     if img.mode in ("RGBA", "LA", "P"):
+         img = img.convert("RGB")
+     return ImageOps.exif_transpose(img)
+
+
+ def _tokenize(s: str) -> List[str]:
+     return re.findall(r"[A-Za-zА-Яа-я0-9]+", s.lower())
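+
+ # e.g. _tokenize("Invoice No. 42") -> ["invoice", "no", "42"]: case is folded
+ # and punctuation dropped, so the phrase matching below is insensitive to
+ # both; Latin, Cyrillic, and digit runs survive as tokens.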
+
+
+ def trocr_ocr(img: Image.Image) -> str:
+     img = _to_rgb(img)
+     inputs = trocr_processor(images=img, return_tensors="pt")
+     pixel_values = inputs.pixel_values.to(DEVICE)
+     with torch.no_grad():
+         ids = trocr_model.generate(pixel_values, max_new_tokens=256)
+     text = trocr_processor.batch_decode(ids, skip_special_tokens=True)[0]
+     return text.strip()
+
+
+ def blip_describe(img: Image.Image) -> str:
+     img = _to_rgb(img)
+     inputs = blip_processor(images=img, return_tensors="pt").to(DEVICE)
+     with torch.no_grad():
+         out = blip_model.generate(**inputs, max_new_tokens=80)
+     return blip_processor.decode(out[0], skip_special_tokens=True).strip()
+
+
+ def render_pdf_page(path: str, page_num: int, dpi: int = DEFAULT_DPI) -> Tuple[fitz.Document, fitz.Page, Image.Image, float]:
+     doc = fitz.open(path)
+     page_idx = max(0, min(page_num - 1, len(doc) - 1))
+     page = doc.load_page(page_idx)
+     zoom = dpi / 72.0
+     pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
+     img = Image.open(BytesIO(pix.tobytes("png")))
+     return doc, page, img, zoom
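+
+ # render_pdf_page: zoom = dpi / 72 because PDF geometry is in points
+ # (1 pt = 1/72 inch); e.g. at DEFAULT_DPI = 200 a 612x792 pt Letter page
+ # renders to 1700x2200 px, and the same zoom later maps word boxes to pixels.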
+
+
+ def pdf_has_text(page: fitz.Page) -> bool:
+     # "words" is empty for scanned pages
+     words = page.get_text("words")
+     return bool(words)
+
+
+ def pdf_extract_text(page: fitz.Page) -> str:
+     txt = page.get_text("text") or ""
+     return txt.strip()
+
+
+ def pdf_to_markdown_simple(page: fitz.Page) -> str:
+     """
+     Lightweight markdown for selectable-text PDFs.
+     - Uses span sizes to guess headers.
+     - No heavy layout logic (keeps it stable and fast on CPU).
+     """
+     data = page.get_text("dict")
+     spans = []
+     for b in data.get("blocks", []):
+         for ln in b.get("lines", []):
+             for sp in ln.get("spans", []):
+                 t = (sp.get("text") or "").strip()
+                 if t:
+                     spans.append(float(sp.get("size", 0.0)))
+     if not spans:
+         return ""
+
+     med = float(np.median(spans))
+     h1_thr = med * 1.60
+     h2_thr = med * 1.35
+
+     lines_out: List[str] = []
+     for b in data.get("blocks", []):
+         if b.get("type") != 0:
+             continue
+         for ln in b.get("lines", []):
+             parts = []
+             sizes = []
+             for sp in ln.get("spans", []):
+                 t = (sp.get("text") or "")
+                 if t.strip():
+                     parts.append(t.strip())
+                     sizes.append(float(sp.get("size", 0.0)))
+             if not parts:
+                 continue
+             line = " ".join(parts).strip()
+             sz = max(sizes) if sizes else med
+
+             if sz >= h1_thr:
+                 lines_out.append("# " + line)
+             elif sz >= h2_thr:
+                 lines_out.append("## " + line)
+             else:
+                 lines_out.append(line)
+
+         lines_out.append("")  # paragraph break
+
+     md = "\n".join(lines_out).strip()
+     return md
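+
+ # pdf_to_markdown_simple example: with a 10 pt median span size, h1_thr = 16.0
+ # and h2_thr = 13.5, so an 18 pt line becomes "# ...", a 14 pt line "## ...",
+ # and body-size lines pass through as plain paragraph text.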
+
+
+ def draw_rects(img: Image.Image, rects_px: List[Tuple[int, int, int, int]]) -> Image.Image:
+     out = img.copy()
+     draw = ImageDraw.Draw(out)
+     overlay = Image.new("RGBA", out.size, (0, 0, 0, 0))
+     draw2 = ImageDraw.Draw(overlay)
+     for (x0, y0, x1, y1) in rects_px:
+         draw.rectangle([x0, y0, x1, y1], outline=(0, 160, 255), width=3)
+         draw2.rectangle([x0, y0, x1, y1], fill=(0, 160, 255, 60))
+     out.paste(overlay, (0, 0), overlay)
+     return out
+
+
+ def locate_in_pdf_words(page: fitz.Page, query: str) -> List[Tuple[float, float, float, float]]:
+     """
+     Returns list of rectangles in PDF coordinate space (points).
+     Uses exact word sequence match (token-based).
+     """
+     q = _tokenize(query)
+     if not q:
+         return []
+
+     words = page.get_text("words")  # x0, y0, x1, y1, "word", block, line, wordno
+     if not words:
+         return []
+
+     w_tokens = [_tokenize(w[4])[0] if _tokenize(w[4]) else "" for w in words]
+     rects: List[Tuple[float, float, float, float]] = []
+
+     n = len(w_tokens)
+     m = len(q)
+     for i in range(0, n - m + 1):
+         if w_tokens[i:i + m] == q:
+             xs0 = [float(words[j][0]) for j in range(i, i + m)]
+             ys0 = [float(words[j][1]) for j in range(i, i + m)]
+             xs1 = [float(words[j][2]) for j in range(i, i + m)]
+             ys1 = [float(words[j][3]) for j in range(i, i + m)]
+             rects.append((min(xs0), min(ys0), max(xs1), max(ys1)))
+
+     return rects
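+
+ # e.g. locate_in_pdf_words(page, "total amount") matches consecutive words
+ # tokenizing to ["total", "amount"] and appends one rect per occurrence: the
+ # min/max union of the matched word boxes, still in PDF points.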
+
+
+ def locate_in_image_tesseract(img: Image.Image, query: str) -> Tuple[List[Tuple[int, int, int, int]], str]:
+     """
+     Returns pixel-space rectangles for the located phrase, plus a short status message.
+     If pytesseract is not available, returns an empty list and a message.
+     """
+     if PYTESS is None:
+         return [], "Tesseract not available: no boxes for images."
+
+     q = _tokenize(query)
+     if not q:
+         return [], "Empty query."
+
+     img = _to_rgb(img)
+     # Request DICT output so every field comes back as a parallel list
+     data = PYTESS.image_to_data(img, output_type=PYTESS.Output.DICT)
+
+     texts = data.get("text", [])
+     left = data.get("left", [])
+     top = data.get("top", [])
+     width = data.get("width", [])
+     height = data.get("height", [])
+     conf = data.get("conf", [])
+
+     tokens = []
+     boxes = []
+     for i, t in enumerate(texts):
+         t = (t or "").strip()
+         if not t:
+             continue
+         tok = _tokenize(t)
+         if not tok:
+             continue
+         # Keep only "reasonable" confidence if numeric
+         try:
+             c = float(conf[i])
+             if c < 0:
+                 continue
+         except Exception:
+             pass
+
+         tokens.append(tok[0])
+         boxes.append((int(left[i]), int(top[i]), int(left[i] + width[i]), int(top[i] + height[i])))
+
+     rects: List[Tuple[int, int, int, int]] = []
+     n = len(tokens)
+     m = len(q)
+     for i in range(0, n - m + 1):
+         if tokens[i:i + m] == q:
+             xs0 = [boxes[j][0] for j in range(i, i + m)]
+             ys0 = [boxes[j][1] for j in range(i, i + m)]
+             xs1 = [boxes[j][2] for j in range(i, i + m)]
+             ys1 = [boxes[j][3] for j in range(i, i + m)]
+             rects.append((min(xs0), min(ys0), max(xs1), max(ys1)))
+
+     if not rects:
+         return [], "Not found."
+     return rects, "Found."
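+
+ # Same sliding-window token match as the PDF path, but over Tesseract word
+ # boxes; a multi-token OCR "word" contributes only its first token (tok[0]),
+ # so punctuation-glued phrases can be missed on noisy scans.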
 
+
+ def as_markdown_block(text: str) -> str:
+     if not text.strip():
+         return ""
+     return "```text\n" + text.strip() + "\n```"
+
+
+ # -------------------------
+ # Main run
+ # -------------------------
+ def process(path: str, task: str, page_num: int, query: str):
+     if not path:
+         return "Upload a file.", "", None
+
+     ext = os.path.splitext(path)[1].lower()
+
+     # ---------- PDF ----------
+     if ext == ".pdf":
+         doc, page, page_img, zoom = render_pdf_page(path, page_num, dpi=DEFAULT_DPI)
+         try:
+             if task == "Describe":
+                 caption = blip_describe(page_img)
+                 return caption, as_markdown_block(caption), None
+
+             if task == "OCR":
+                 if pdf_has_text(page):
+                     txt = pdf_extract_text(page)
+                 else:
+                     txt = trocr_ocr(page_img)
+                 return txt, as_markdown_block(txt), None
+
+             if task == "Markdown":
+                 if pdf_has_text(page):
+                     md = pdf_to_markdown_simple(page)
+                     if not md:
+                         txt = pdf_extract_text(page)
+                         md = as_markdown_block(txt)
+                 else:
+                     txt = trocr_ocr(page_img)
+                     md = as_markdown_block(txt)
+                 return md, md, None
+
+             if task == "Locate":
+                 if not query.strip():
+                     return "Enter text to locate.", "", page_img
+
+                 # 1) Prefer precise PDF word boxes (selectable text)
+                 rects_pdf = locate_in_pdf_words(page, query)
+                 if rects_pdf:
+                     # Convert PDF points -> pixels using the same render zoom
+                     rects_px = []
+                     for (x0, y0, x1, y1) in rects_pdf:
+                         rects_px.append((int(x0 * zoom), int(y0 * zoom), int(x1 * zoom), int(y1 * zoom)))
+                     boxed = draw_rects(page_img, rects_px)
+                     return "Found.", "", boxed
+
+                 # 2) Fallback: on a scanned page, try tesseract boxes on the rendered image
+                 rects_px, msg = locate_in_image_tesseract(page_img, query)
+                 boxed = draw_rects(page_img, rects_px) if rects_px else page_img
+                 return msg, "", boxed
+
+             return "Unknown task.", "", None
+         finally:
+             doc.close()
+
+     # ---------- Image ----------
+     img = _to_rgb(Image.open(path))
+
+     if task == "Describe":
+         caption = blip_describe(img)
+         return caption, as_markdown_block(caption), None
+
+     if task == "OCR":
+         txt = trocr_ocr(img)
+         return txt, as_markdown_block(txt), None
+
+     if task == "Markdown":
+         txt = trocr_ocr(img)
+         md = as_markdown_block(txt)
+         return md, md, None
+
+     if task == "Locate":
+         if not query.strip():
+             return "Enter text to locate.", "", img
+
+         rects_px, msg = locate_in_image_tesseract(img, query)
+         boxed = draw_rects(img, rects_px) if rects_px else img
+         return msg, "", boxed
+
+     return "Unknown task.", "", None
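+
+ # Every process() branch returns the same (text, markdown, boxed image)
+ # triple, which maps one-to-one onto the out_text / out_md / out_boxes
+ # components wired up below.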
+
+
+ # -------------------------
+ # UI helpers
+ # -------------------------
+ def update_page_selector(file_path: str):
+     if not file_path:
+         return gr.update(visible=False), gr.update(value=None)
+
+     ext = os.path.splitext(file_path)[1].lower()
+     if ext != ".pdf":
+         return gr.update(visible=False), gr.update(value=_to_rgb(Image.open(file_path)))
+
+     doc = fitz.open(file_path)
+     pages = len(doc)
+     doc.close()
+
+     # Show first page preview
+     _, _, img, _ = render_pdf_page(file_path, 1, dpi=DEFAULT_DPI)
+     return (
+         gr.update(visible=True, minimum=1, maximum=max(1, pages), value=1),
+         gr.update(value=img),
+     )
+
+
+ def update_preview(file_path: str, page_num: int):
+     if not file_path:
+         return None
+     ext = os.path.splitext(file_path)[1].lower()
+     if ext != ".pdf":
+         return _to_rgb(Image.open(file_path))
+     _, _, img, _ = render_pdf_page(file_path, int(page_num), dpi=DEFAULT_DPI)
+     return img
+
+
+ def toggle_query(task: str):
+     return gr.update(visible=(task == "Locate"))
+
+
+ # -------------------------
+ # Build app (minimal style)
+ # -------------------------
+ theme = gr.themes.Base(
+     font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"],
+ )
+
+ with gr.Blocks(theme=theme, title="Doc Tool (CPU)") as demo:
+     with gr.Row():
+         with gr.Column(scale=1, min_width=320):
+             file_in = gr.File(label="File", file_types=["image", ".pdf"], type="filepath")
+             page_num = gr.Slider(label="Page", minimum=1, maximum=1, value=1, step=1, visible=False)
+             task = gr.Dropdown(label="Task", choices=TASKS, value="OCR")
+             query = gr.Textbox(label="Query", visible=False, placeholder="Text to locate")
+
+             run_btn = gr.Button("Run", variant="primary")
+
+         with gr.Column(scale=2):
+             preview = gr.Image(label="Preview", type="pil", height=360)
+             out_text = gr.Textbox(label="Output", lines=10)
+             out_md = gr.Markdown()
+
+     out_boxes = gr.Image(label="Boxes", type="pil", height=360)
+
+     file_in.change(update_page_selector, inputs=[file_in], outputs=[page_num, preview])
+     page_num.change(update_preview, inputs=[file_in, page_num], outputs=[preview])
+     task.change(toggle_query, inputs=[task], outputs=[query])
+
+     def on_run(file_path, task_name, page, q):
+         text, md, boxed = process(file_path, task_name, int(page), q or "")
+         return text, md, boxed
+
+     run_btn.click(
+         on_run,
+         inputs=[file_in, task, page_num, query],
+         outputs=[out_text, out_md, out_boxes],
+     )
 
  if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
+     # Disable SSR to avoid extra startup noise
+     demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)