Skip to main content

Three layers: a short note at the top, the key lines with our take in the middle, the full source at the bottom.

Source

preprocess.py

The OCR preprocessing step. The exact pipeline that runs on every uploaded image before reading.

Repo path: services/docling/preprocess.py · Language: Python

Short note — more on the way

What this is

The OCR preprocessing step. The exact pipeline that runs on every uploaded image before reading.

What it proves

This file backs one or more of the privacy promises. It is a versioned source file that lives in the repository. Read the promise →

What to look for in the source below

  • Comments and headers that name what each section does.
  • File edges: imports at the top, exports or run-blocks at the bottom.
  • Any list, configuration, or assertion that looks load-bearing.
Show the full file (505 lines)

505 lines

"""Image preprocessing for photographed receipts (runs before OCR).

The target user photographs a paper receipt on a prep table, so the
image arrives rotated, skewed, perspective-distorted, shadowed, and
low-contrast. docling/RapidOCR sees raw pixels with no cleanup today
(converter.py only upscales 2x + full-page-OCRs), so a phone photo
extracts poorly. This module is the fix: a deterministic, in-memory
cleanup pipeline that runs on IMAGE inputs (JPEG/PNG/WebP) before the
OCR engine sees them. PDFs and scans bypass or get only a light touch.

Design rules (see runbooks/proposals/photo-preprocessing.md):

  * Deterministic. Same bytes -> same pixels -> same OCR. We pin
    OpenCV to a single thread and use no RNG, so the attestation /
    reproducibility story holds.
  * Never worse than the original. Cleanup is an *enhancement*; if a
    caller-supplied scorer says the processed image is less legible
    than the original, we return the original. (The real scorer is
    OCR-token-count, supplied at integration; the module itself takes
    no position and ships the hook.)
  * Never crop blind. Perspective dewarp only fires when a confident
    page quad is found; otherwise we skip it rather than risk cropping
    a line item.
  * docling only registers InputFormat.PDF, so the cleaned raster is
    wrapped into a single-page PDF -- this module doubles as the
    image->PDF adapter the service was missing.

This is pure document-AI (geometry + contrast math), NO language
model and no network -- consistent with converter.py's privacy note.
"""

from __future__ import annotations

import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, List, Optional, Tuple

import cv2
import numpy as np
from PIL import Image, ImageOps

# HEIC/HEIF is the iPhone default photo format, so the target user
# (a chef photographing receipts on an iPhone) hits it constantly.
# docling/RapidOCR cannot read it; registering the HEIF opener lets
# Pillow decode it transparently inside load_oriented_gray. The
# import is guarded so a dev env without the wheel still loads this
# module (HEIC just won't decode there). pillow-heif is tracked in
# the startup attestation since it touches tenant pixels.
#
# _HEIF_AVAILABLE records whether the opener was registered. Note that
# sniff_kind() classifies HEIC/HEIF bytes as "image" regardless of
# this flag.
try:
    from pillow_heif import register_heif_opener

    register_heif_opener()
    _HEIF_AVAILABLE = True
except Exception:  # pragma: no cover - dev envs without the wheel
    # Broad catch: any failure while importing or registering the
    # opener (not only a missing wheel) leaves the module importable
    # with HEIF marked unavailable.
    _HEIF_AVAILABLE = False

# Determinism + concurrency discipline: a single OpenCV thread keeps
# output bit-stable across runs and stops BLAS/OpenMP fan-out from
# thrashing the 2-CPU box under the soft_limit=4 concurrency model.
# Set at import so every code path inherits it. This is a
# process-wide OpenCV setting, applied as a side effect of importing
# this module.
cv2.setNumThreads(1)

# The owner-legible stages a demo can surface, in pipeline order. The
# internal op ids (left) stay out of user copy; the demo maps them to
# friendly labels.
#
# preprocess_image() always records "autorotate" (EXIF orientation is
# applied while loading); the other ids only run when they are present
# in PreprocessConfig.ops, and they always run in the order listed
# here, whatever order the configured tuple uses.
DEFAULT_OPS: Tuple[str, ...] = (
    "autorotate",  # EXIF orientation (applied at load)
    "dewarp",  # find the page + flatten perspective
    "deskew",  # rotate to level
    "denoise",  # clear speckle
    "illumination",  # even out shadows/glare
    "contrast",  # boost faint print (CLAHE)
)

# Bytes that mean "not a photo we should clean": digital PDFs already
# carry a text layer, TIFFs are typically flatbed scans. Both bypass.
_PDF_MAGIC = b"%PDF-"
# Two TIFF signatures, one per byte order.
_TIFF_MAGICS = (b"II*\x00", b"MM\x00*")
# ISO Base Media (HEIC/HEIF) brands -- mirrors the Worker's
# detectFormat in apps/api/src/lib/ingest-format.ts. sniff_kind()
# compares these against header bytes 8-11, after the "ftyp" tag.
_HEIF_BRANDS = (b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1")


def sniff_kind(header: bytes) -> str:
    """Classify a file from its leading magic bytes.

    Returns 'pdf', 'tiff', 'image' or 'unknown'. JPEG, PNG, WebP and
    HEIC/HEIF all map to 'image', which is the only kind that gets
    preprocessed.
    """
    if header.startswith(_PDF_MAGIC):
        return "pdf"
    # bytes.startswith accepts a tuple of candidate prefixes.
    if header.startswith(_TIFF_MAGICS):
        return "tiff"

    is_jpeg = header.startswith(b"\xff\xd8\xff")
    is_png = header.startswith(b"\x89PNG\r\n\x1a\n")
    # WebP is a RIFF container whose form type sits at bytes 8-11.
    is_webp = header[:4] == b"RIFF" and header[8:12] == b"WEBP"
    # HEIC/HEIF: bytes 4-7 = "ftyp", bytes 8-11 = a known brand.
    is_heif = (
        len(header) >= 12
        and header[4:8] == b"ftyp"
        and header[8:12] in _HEIF_BRANDS
    )
    if is_jpeg or is_png or is_webp or is_heif:
        return "image"
    return "unknown"


def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean feature flag from the environment.

    Returns ``default`` when the variable is unset OR set to an
    empty/whitespace-only string, so a blank assignment in a deploy
    manifest cannot silently flip a default-on flag off. This matches
    _env_int/_env_float, which also fall back to the default on a blank
    value. Otherwise returns True only for "1", "true", "yes" or "on"
    (case-insensitive, surrounding whitespace ignored); any other
    non-blank value is False.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    token = raw.strip().lower()
    if not token:
        # Set but blank: treat as "not configured".
        return default
    return token in ("1", "true", "yes", "on")


def _env_float(name: str, default: float) -> float:
    """Read a float from the environment.

    An unset variable is parsed from ``str(default)``; a value that
    ``float()`` rejects (including a blank string) yields ``default``.
    """
    raw = os.environ.get(name)
    text = str(default) if raw is None else raw
    try:
        parsed = float(text)
    except (TypeError, ValueError):
        return default
    return parsed


def _env_int(name: str, default: int) -> int:
    """Read an integer from the environment.

    An unset variable is parsed from ``str(default)``; a value that
    ``int()`` rejects (blank, "1.5", non-numeric text) yields
    ``default``.
    """
    raw = os.environ.get(name)
    text = str(default) if raw is None else raw
    try:
        parsed = int(text)
    except (TypeError, ValueError):
        return default
    return parsed


@dataclass(frozen=True)
class PreprocessConfig:
    """Immutable tunables for the preprocessing pipeline.

    Follows the DOCLING_* env convention used by converter.py. Ships
    dark: ``enabled`` defaults to False, so the module can land and be
    tested without changing production behaviour until an env flip
    turns it on.
    """

    enabled: bool = False
    ops: Tuple[str, ...] = DEFAULT_OPS
    max_pixels: int = 24_000_000
    deskew_max_deg: float = 15.0
    clahe_clip: float = 2.0
    clahe_grid: int = 8
    binarize: bool = False
    never_worse: bool = True

    @classmethod
    def from_env(cls) -> "PreprocessConfig":
        """Build a config from DOCLING_PREPROCESS_* variables.

        DOCLING_PREPROCESS_OPS is a comma-separated list of op ids;
        blank entries are dropped. When the variable is unset or empty
        DEFAULT_OPS is used. Every other field goes through the
        _env_flag/_env_int/_env_float helpers with the same fallback
        values as the field defaults above.
        """
        raw_ops = os.environ.get("DOCLING_PREPROCESS_OPS")
        if raw_ops:
            pieces = (piece.strip() for piece in raw_ops.split(","))
            selected_ops = tuple(piece for piece in pieces if piece)
        else:
            selected_ops = DEFAULT_OPS

        enabled = _env_flag("DOCLING_PREPROCESS_ENABLED", False)
        max_pixels = _env_int("DOCLING_PREPROCESS_MAX_PIXELS", 24_000_000)
        deskew_max_deg = _env_float("DOCLING_PREPROCESS_DESKEW_MAX_DEG", 15.0)
        clahe_clip = _env_float("DOCLING_PREPROCESS_CLAHE_CLIP", 2.0)
        clahe_grid = _env_int("DOCLING_PREPROCESS_CLAHE_GRID", 8)
        binarize = _env_flag("DOCLING_PREPROCESS_BINARIZE", False)
        never_worse = _env_flag("DOCLING_PREPROCESS_NEVER_WORSE", True)

        return cls(
            enabled=enabled,
            ops=selected_ops,
            max_pixels=max_pixels,
            deskew_max_deg=deskew_max_deg,
            clahe_clip=clahe_clip,
            clahe_grid=clahe_grid,
            binarize=binarize,
            never_worse=never_worse,
        )


@dataclass
class PreprocessResult:
    """The cleaned image plus a record of what happened, for logging
    and for authoring the demo's honest before/after art.

    Populated by preprocess_image(). ``applied`` and ``stages`` list
    every stage that ran, even when the never-worse check ends up
    handing back the original (``used_original`` is then True)."""

    # Image to hand to OCR: the processed result, or ``original`` when
    # the never-worse check rejected the processed one.
    image: np.ndarray  # final single-channel (grayscale) uint8
    original: np.ndarray  # post-EXIF, post-downscale grayscale baseline
    # Op ids in the order they ran; always starts with "autorotate".
    applied: List[str] = field(default_factory=list)
    # (op id, snapshot copy of the working image after that op), one
    # entry per applied stage.
    stages: List[Tuple[str, np.ndarray]] = field(default_factory=list)
    # Angle handed to rotate() when deskew fired; 0.0 otherwise.
    skew_deg: float = 0.0
    # True when a page quad was found and the perspective warp ran.
    dewarped: bool = False
    # True when the scorer rated the processed image below the original.
    used_original: bool = False


# ----------------------------------------------------------------- load


def load_oriented_gray(path: str) -> np.ndarray:
    """Decode an image, apply its EXIF orientation, return grayscale
    uint8 as a C-contiguous array.

    EXIF orientation MUST be first: a sideways phone photo has upright
    sensor pixels + a rotate flag, and every later step assumes upright
    content.

    Images that carry transparency (an alpha band, or a palette/colour
    key advertised via ``info["transparency"]``) are composited onto
    white before the grayscale conversion. ``convert("L")`` on its own
    discards alpha, so fully transparent regions would take whatever
    RGB value hides underneath (commonly black) instead of reading as
    paper. White matches the fill used by rotate().

    Raises whatever ``PIL.Image.open`` raises for unreadable input.
    """
    with Image.open(path) as im:
        im = ImageOps.exif_transpose(im)
        if im.mode in ("RGBA", "LA") or "transparency" in im.info:
            rgba = im.convert("RGBA")
            paper = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
            im = Image.alpha_composite(paper, rgba)
        gray = im.convert("L")
        # Materialise the pixels while the source file is still open.
        arr = np.asarray(gray, dtype=np.uint8)
    return np.ascontiguousarray(arr)


def _downscale(gray: np.ndarray, max_pixels: int) -> np.ndarray:
    """Shrink ``gray`` so its pixel count does not exceed ``max_pixels``.

    Returns the input object itself (no copy) when it is already within
    budget. Otherwise both sides are scaled by the same factor,
    truncated to whole pixels (minimum 1), and resampled with
    INTER_AREA.
    """
    rows, cols = gray.shape[:2]
    pixel_count = rows * cols
    if pixel_count > max_pixels:
        factor = (max_pixels / float(pixel_count)) ** 0.5
        target_size = (max(1, int(cols * factor)), max(1, int(rows * factor)))
        return cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
    return gray


# -------------------------------------------------------------- dewarp


def _order_quad(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as top-left, top-right, bottom-right, bottom-left.

    ``pts`` is anything reshapeable to (4, 2), e.g. the (4, 1, 2)
    output of cv2.approxPolyDP. Returns a float32 (4, 2) array.

    The corners are sorted by angle around their centroid, which walks
    a convex quad clockwise on screen (image y points down), and the
    walk then starts at the corner with the smallest x+y. For ordinary
    page quads this gives the same result as the classic
    "min/max of x+y and y-x" rule. Unlike that rule it can never return
    the same vertex twice: on diamond-like quads (corners near the
    middle of each frame edge) the sum/diff rule picks one vertex for
    two corners, yielding a degenerate quad for the perspective
    transform.
    """
    pts = pts.reshape(4, 2).astype(np.float32)
    centre = pts.mean(axis=0)
    # Ascending atan2 in image coordinates == clockwise on screen.
    angles = np.arctan2(pts[:, 1] - centre[1], pts[:, 0] - centre[0])
    ring = pts[np.argsort(angles, kind="stable")]
    # Rotate the cyclic order so the top-left (smallest x+y) comes first;
    # the rest then follow as tr, br, bl.
    start = int(np.argmin(ring.sum(axis=1)))
    return np.ascontiguousarray(np.roll(ring, -start, axis=0), dtype=np.float32)


def find_document_quad(gray: np.ndarray) -> Optional[np.ndarray]:
    """Find the largest convex 4-vertex contour that plausibly is the
    page.

    Returns the ordered quad (tl, tr, br, bl; float32) in
    full-resolution coordinates, or None when no confident page
    boundary is found (caller then skips dewarp -- never crop blind).
    An image with a zero-length side also yields None.
    """
    h, w = gray.shape[:2]
    if h == 0 or w == 0:
        # Nothing to analyse; also keeps the OpenCV calls below from
        # being handed an empty matrix.
        return None
    frame_area = float(h * w)
    # Work on a downscaled copy for speed + edge stability.
    long_edge = max(h, w)
    scale = 1000.0 / long_edge if long_edge > 1000 else 1.0
    if scale < 1.0:
        # Clamp to >= 1 px: on a very thin image int() would truncate
        # the short side to 0 and cv2.resize rejects an empty size.
        small_w = max(1, int(w * scale))
        small_h = max(1, int(h * scale))
        small = cv2.resize(gray, (small_w, small_h), interpolation=cv2.INTER_AREA)
    else:
        small = gray
    blurred = cv2.GaussianBlur(small, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)
    # Dilate so small gaps in the page outline close into one contour.
    edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=1)
    contours, _ = cv2.findContours(
        edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    small_area = float(small.shape[0] * small.shape[1])
    best: Optional[np.ndarray] = None
    # Only the five largest contours are considered; the first one that
    # qualifies wins.
    for cnt in sorted(contours, key=cv2.contourArea, reverse=True)[:5]:
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
        if len(approx) != 4 or not cv2.isContourConvex(approx):
            continue
        # Require the quad to cover a real fraction of the frame, else
        # it is probably a logo box or a line-item cell, not the page.
        if cv2.contourArea(approx) < 0.25 * small_area:
            continue
        # Map back from the working copy to full-resolution coordinates.
        best = approx.astype(np.float32) / scale
        break
    if best is None:
        return None
    quad = _order_quad(best)
    # Final sanity: the quad's area in full-res must still be a real
    # fraction of the frame.
    if cv2.contourArea(quad) < 0.25 * frame_area:
        return None
    return quad


def four_point_warp(gray: np.ndarray, quad: np.ndarray) -> np.ndarray:
    """Flatten ``quad`` into an upright rectangle.

    ``quad`` holds four corners ordered tl, tr, br, bl. The output
    width is the longer of the top/bottom edges and the height the
    longer of the left/right edges (truncated to int, at least 1 px).
    """
    tl, tr, br, bl = quad
    bottom_len = np.linalg.norm(br - bl)
    top_len = np.linalg.norm(tr - tl)
    right_len = np.linalg.norm(tr - br)
    left_len = np.linalg.norm(tl - bl)

    width = max(int(max(bottom_len, top_len)), 1)
    height = max(int(max(right_len, left_len)), 1)

    last_col = width - 1
    last_row = height - 1
    target = np.array(
        [[0, 0], [last_col, 0], [last_col, last_row], [0, last_row]],
        dtype=np.float32,
    )
    transform = cv2.getPerspectiveTransform(quad, target)
    return cv2.warpPerspective(gray, transform, (width, height))


# -------------------------------------------------------------- deskew


def estimate_skew(gray: np.ndarray) -> float:
    """Estimate page skew in degrees from the ink mask.

    The image is Otsu-thresholded (inverted, so ink is non-zero) and
    cv2.minAreaRect is fitted to all ink pixels. Returns 0.0 when fewer
    than 50 ink pixels exist. Otherwise the rectangle angle is folded
    into (-45, 45], which makes the result independent of whether the
    OpenCV build reports (0, 90] or [-90, 0].

    preprocess_image() hands the value to rotate() unchanged.
    NOTE(review): the rotation direction implied by the sign is not
    established in this function -- confirm against the deskew tests.
    """
    _, ink_mask = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )
    ink_points = cv2.findNonZero(ink_mask)
    if ink_points is None or len(ink_points) < 50:
        return 0.0
    rect_angle = cv2.minAreaRect(ink_points)[-1]
    # Fold modulo 90, then pick the representative closest to zero.
    folded = rect_angle % 90
    return float(folded - 90 if folded > 45 else folded)


def rotate(gray: np.ndarray, angle_deg: float) -> np.ndarray:
    """Rotate about the image centre on an enlarged canvas.

    The output size is the bounding box of the rotated frame (truncated
    to whole pixels) and the transform is shifted so the content stays
    centred. Uncovered border is filled with 255 (white, i.e. paper);
    resampling is bicubic.
    """
    rows, cols = gray.shape[:2]
    cx, cy = cols / 2.0, rows / 2.0
    matrix = cv2.getRotationMatrix2D((cx, cy), angle_deg, 1.0)

    abs_cos = abs(matrix[0, 0])
    abs_sin = abs(matrix[0, 1])
    out_w = int(rows * abs_sin + cols * abs_cos)
    out_h = int(rows * abs_cos + cols * abs_sin)

    # Re-centre: move the old centre onto the centre of the new canvas.
    matrix[0, 2] += (out_w / 2.0) - cx
    matrix[1, 2] += (out_h / 2.0) - cy

    return cv2.warpAffine(
        gray,
        matrix,
        (out_w, out_h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=255,
    )


# --------------------------------------------------- denoise / contrast


def denoise(gray: np.ndarray) -> np.ndarray:
    """Non-local-means denoise with fixed, conservative settings
    (h=7, 7 px template window, 21 px search window), chosen so faint
    thermal strokes survive."""
    nlm_settings = {
        "h": 7,
        "templateWindowSize": 7,
        "searchWindowSize": 21,
    }
    return cv2.fastNlMeansDenoising(gray, **nlm_settings)


def normalize_illumination(gray: np.ndarray) -> np.ndarray:
    """Even out shadows/glare.

    A background estimate is built with a morphological close
    (31x31 elliptical element) and the image is divided by it, scaled
    back to 0..255. Returns uint8.
    """
    element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (31, 31))
    closed = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, element)
    # Replace zeros in the divisor so the division never hits 0.
    divisor = np.where(closed == 0, 1, closed).astype(np.uint8)
    flattened = cv2.divide(gray, divisor, scale=255)
    return flattened.astype(np.uint8)


def apply_clahe(gray: np.ndarray, clip: float, grid: int) -> np.ndarray:
    """Apply CLAHE (tile-local contrast equalisation).

    ``clip`` is the clip limit and ``grid`` the number of tiles per
    side (a square ``grid`` x ``grid`` layout). Lifts faint regions
    without blowing out bright ones.
    """
    tiles = (grid, grid)
    equalizer = cv2.createCLAHE(clipLimit=clip, tileGridSize=tiles)
    return equalizer.apply(gray)


def binarize(gray: np.ndarray) -> np.ndarray:
    """Gaussian adaptive threshold to a 0/255 image (block size 25,
    constant 10).

    Opt-in only (PreprocessConfig.binarize defaults to False):
    aggressive binarization kills faint thermal print, so it is meant
    for pathological low-contrast scans.
    """
    max_value = 255
    method = cv2.ADAPTIVE_THRESH_GAUSSIAN_C
    threshold_type = cv2.THRESH_BINARY
    return cv2.adaptiveThreshold(
        gray, max_value, method, threshold_type, blockSize=25, C=10
    )


def legibility(gray: np.ndarray) -> float:
    """Cheap sharpness proxy: variance of the 64-bit Laplacian.

    For diagnostics and tests only. It is NOT the never-worse arbiter
    (denoise legitimately lowers it); that role belongs to the
    OCR-based scorer the caller supplies.
    """
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    return float(laplacian.var())


# ------------------------------------------------------- orchestration


def preprocess_image(
    path: str,
    config: PreprocessConfig,
    *,
    scorer: Optional[Callable[[np.ndarray], float]] = None,
) -> PreprocessResult:
    """Run the configured pipeline on one image file.

    Stages run in a fixed order -- autorotate (always), dewarp, deskew,
    denoise, illumination, contrast, then optional binarize -- and each
    stage that fires is logged in ``result.applied`` with a snapshot in
    ``result.stages``. Dewarp is skipped when no page quad is found;
    deskew is skipped when the estimated angle is under 0.1 degrees or
    above ``config.deskew_max_deg``.

    ``scorer`` is the never-worse arbiter: a callable mapping an image
    to a legibility score (e.g. OCR token count). When
    ``config.never_worse`` is set and a scorer is supplied, the
    original is returned if the processed image scores strictly lower;
    on a tie the processed image is kept. With no scorer the module
    takes no position and returns the processed image.
    """
    baseline = _downscale(load_oriented_gray(path), config.max_pixels)
    original = baseline.copy()
    result = PreprocessResult(image=baseline, original=original)

    def record(op_id: str, snapshot: np.ndarray) -> None:
        # Log the stage and keep an independent copy of its output.
        result.applied.append(op_id)
        result.stages.append((op_id, snapshot.copy()))

    record("autorotate", baseline)
    work = baseline

    if "dewarp" in config.ops:
        quad = find_document_quad(work)
        if quad is not None:
            work = four_point_warp(work, quad)
            result.dewarped = True
            record("dewarp", work)

    if "deskew" in config.ops:
        angle = estimate_skew(work)
        if 0.1 <= abs(angle) <= config.deskew_max_deg:
            work = rotate(work, angle)
            result.skew_deg = angle
            record("deskew", work)

    # The remaining optional stages share one shape: image in, image out.
    pixel_stages = (
        ("denoise", denoise),
        ("illumination", normalize_illumination),
        (
            "contrast",
            lambda img: apply_clahe(img, config.clahe_clip, config.clahe_grid),
        ),
    )
    for op_id, stage in pixel_stages:
        if op_id in config.ops:
            work = stage(work)
            record(op_id, work)

    if config.binarize:
        work = binarize(work)
        record("binarize", work)

    # Never worse than the original: only fires with a real scorer.
    reject_processed = (
        config.never_worse
        and scorer is not None
        and scorer(work) < scorer(original)
    )
    if reject_processed:
        result.used_original = True
        result.image = original
    else:
        result.image = work
    return result


def wrap_to_pdf(gray: np.ndarray, out_path: str) -> str:
    """Write one grayscale image to ``out_path`` as a single-page PDF
    at 200 dpi and return ``out_path``.

    docling only registers InputFormat.PDF, so every cleaned raster is
    normalized to PDF here. All document metadata is pinned (empty
    title/producer/creator, fixed epoch dates) so that the same pixels
    are intended to produce identical bytes.
    """
    # Pillow otherwise writes the source filename into /Title and the
    # current time into /CreationDate + /ModDate.
    fixed_date = "D:19700101000000Z"
    pinned_metadata = {
        "title": "",
        "producer": "",
        "creator": "",
        "creationDate": fixed_date,
        "modDate": fixed_date,
    }
    page = Image.fromarray(gray, mode="L")
    page.save(out_path, format="PDF", resolution=200.0, **pinned_metadata)
    return out_path


def preprocess_to_pdf(
    src_path: str,
    *,
    out_dir: str = "/tmp",
    config: Optional[PreprocessConfig] = None,
    scorer: Optional[Callable[[np.ndarray], float]] = None,
) -> Tuple[str, Optional[PreprocessResult]]:
    """Top-level entry the converter calls. Returns
    ``(path, result)``:

      * ``(src_path, None)`` when preprocessing is disabled, the file
        cannot be opened, or the input is not a photo
        (PDF/TIFF/unknown) -- the caller proceeds with the original
        file, unchanged. NEVER mutates ``src_path``.
      * ``(new_pdf_path, result)`` when an image was cleaned -- a new
        single-page PDF in ``out_dir`` that the caller MUST clean up.

    ``config`` defaults to ``PreprocessConfig.from_env()``; ``scorer``
    is forwarded to preprocess_image() as the never-worse arbiter.

    Errors from decoding or processing the image propagate. If writing
    the PDF fails, the temp file created for it is removed before the
    error is re-raised, so a failed call leaves nothing behind in
    ``out_dir`` (the caller never learns that path and could not clean
    it up itself).
    """
    cfg = config or PreprocessConfig.from_env()
    if not cfg.enabled:
        return src_path, None
    try:
        # 16 bytes cover every signature sniff_kind() looks at.
        with open(src_path, "rb") as f:
            header = f.read(16)
    except OSError:
        return src_path, None
    if sniff_kind(header) != "image":
        return src_path, None

    result = preprocess_image(src_path, cfg, scorer=scorer)
    # mkstemp reserves a unique name; only the path is needed, so the
    # descriptor is closed straight away.
    fd, out_path = tempfile.mkstemp(suffix=".pdf", dir=out_dir)
    os.close(fd)
    try:
        wrap_to_pdf(result.image, out_path)
    except BaseException:
        # Do not leak the reserved temp file on failure.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path, result

This is the file as it lives at the moment of this build. The canonical history lives in git. If you want the full history or a specific commit, write to hello@muntin.digital.

preprocess.py · Verify · Muntin Ledger · Muntin