Three layers: a short note at the top, the key lines with our take in the middle, the full source at the bottom.
Source
preprocess.py
The OCR preprocessing step. The exact pipeline that runs on every uploaded image before reading.
Repo path: services/docling/preprocess.py · Language: Python
Short note — more on the way
What this is
The OCR preprocessing step. The exact pipeline that runs on every uploaded image before reading.
What it proves
This file backs one or more of the privacy promises. It is a source file that lives versioned in the repository. Read the promise →
What to look for in the source below
- Comments and headers that name what each section does.
- File edges: imports at the top, exports or run-blocks at the bottom.
- Any list, configuration, or assertion that looks load-bearing.
Show the full file (506 lines)
505 lines
"""Image preprocessing for photographed receipts (runs before OCR).
The target user photographs a paper receipt on a prep table, so the
image arrives rotated, skewed, perspective-distorted, shadowed, and
low-contrast. docling/RapidOCR sees raw pixels with no cleanup today
(converter.py only upscales 2x + full-page-OCRs), so a phone photo
extracts poorly. This module is the fix: a deterministic, in-memory
cleanup pipeline that runs on IMAGE inputs (JPEG/PNG/WebP/HEIC) before the
OCR engine sees them. PDFs and scans bypass or get only a light touch.
Design rules (see runbooks/proposals/photo-preprocessing.md):
* Deterministic. Same bytes -> same pixels -> same OCR. We pin
OpenCV to a single thread and use no RNG, so the attestation /
reproducibility story holds.
* Never worse than the original. Cleanup is an *enhancement*; if a
caller-supplied scorer says the processed image is less legible
than the original, we return the original. (The real scorer is
OCR-token-count, supplied at integration; the module itself takes
no position and ships the hook.)
* Never crop blind. Perspective dewarp only fires when a confident
page quad is found; otherwise we skip it rather than risk cropping
a line item.
* docling only registers InputFormat.PDF, so the cleaned raster is
wrapped into a single-page PDF -- this module doubles as the
image->PDF adapter the service was missing.
This is pure document-AI (geometry + contrast math), NO language
model and no network -- consistent with converter.py's privacy note.
"""
from __future__ import annotations
import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, List, Optional, Tuple
import cv2
import numpy as np
from PIL import Image, ImageOps
# iPhones save photos as HEIC/HEIF by default, and the target user (a
# chef photographing receipts on an iPhone) sends that format
# constantly. docling/RapidOCR cannot read it, so the HEIF opener is
# registered with Pillow here and load_oriented_gray can then decode
# it transparently. The whole attempt is guarded so a dev env without
# the wheel can still import this module (HEIC simply will not decode
# there). pillow-heif is tracked in the startup attestation since it
# touches tenant pixels.
_HEIF_AVAILABLE = False
try:
    from pillow_heif import register_heif_opener

    register_heif_opener()
except Exception:  # pragma: no cover - dev envs without the wheel
    pass
else:
    # Only reached when both the import and the registration succeeded.
    _HEIF_AVAILABLE = True
# Determinism + concurrency discipline: a single OpenCV thread keeps
# output bit-stable across runs and stops BLAS/OpenMP fan-out from
# thrashing the 2-CPU box under the soft_limit=4 concurrency model.
# This is a deliberate import-time side effect: it is set once here,
# at module load, so every code path below inherits it without each
# function having to remember to pin the thread count itself.
cv2.setNumThreads(1)
# The owner-legible stages a demo can surface, in pipeline order. The
# internal op ids (left) stay out of user copy; the demo maps them to
# friendly labels.
#
# preprocess_image tests membership of each id in PreprocessConfig.ops
# and always runs the stages in the fixed order below, whatever order
# the tuple lists them in. Two things are NOT controlled from here:
#   * "autorotate" is recorded unconditionally, because EXIF
#     orientation is applied when the image is loaded;
#   * binarization is toggled by PreprocessConfig.binarize, not by an
#     op id.
DEFAULT_OPS: Tuple[str, ...] = (
    "autorotate",  # EXIF orientation (applied at load)
    "dewarp",  # find the page + flatten perspective
    "deskew",  # rotate to level
    "denoise",  # clear speckle
    "illumination",  # even out shadows/glare
    "contrast",  # boost faint print (CLAHE)
)
# Bytes that mean "not a photo we should clean": digital PDFs already
# carry a text layer, TIFFs are typically flatbed scans. Both bypass.
# sniff_kind matches these as prefixes of the file header.
_PDF_MAGIC = b"%PDF-"
# The two classic TIFF signatures: little-endian ("II") and
# big-endian ("MM") byte order.
_TIFF_MAGICS = (b"II*\x00", b"MM\x00*")
# ISO Base Media (HEIC/HEIF) brands -- mirrors the Worker's
# detectFormat in apps/api/src/lib/ingest-format.ts. sniff_kind
# compares these against header bytes 8-11, after an "ftyp" tag at
# bytes 4-7.
_HEIF_BRANDS = (b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1")
def sniff_kind(header: bytes) -> str:
    """Classify a file by the magic bytes at the start of ``header``.

    Returns one of 'pdf', 'tiff', 'image' or 'unknown'. Only 'image'
    (JPEG, PNG, WebP, HEIC/HEIF) is routed to the preprocessing
    pipeline; every other answer means "leave the file alone".
    """
    # Bypass formats are checked first.
    if header.startswith(_PDF_MAGIC):
        return "pdf"
    if header.startswith(_TIFF_MAGICS):
        return "tiff"

    # Bytes 8-11 carry the container sub-type for both RIFF (WebP) and
    # ISO Base Media (HEIC/HEIF) files.
    brand = header[8:12]
    looks_like_photo = (
        header.startswith(b"\xff\xd8\xff")  # JPEG
        or header.startswith(b"\x89PNG\r\n\x1a\n")  # PNG
        or (header[:4] == b"RIFF" and brand == b"WEBP")  # WebP
        # HEIC/HEIF: bytes 4-7 = "ftyp", bytes 8-11 = a known brand.
        or (len(header) >= 12 and header[4:8] == b"ftyp" and brand in _HEIF_BRANDS)
    )
    return "image" if looks_like_photo else "unknown"
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean feature flag from the environment.

    ``name`` is the environment variable to read; ``default`` is
    returned when the variable is unset OR set to an empty /
    whitespace-only string. The empty case matters: ``VAR=`` in an env
    file usually means "not configured", and the sibling helpers
    ``_env_int`` / ``_env_float`` already fall back to their default
    for it. Without this, an empty DOCLING_PREPROCESS_NEVER_WORSE
    would silently switch a default-on safety off.

    Any other value is truthy only if it is one of "1", "true", "yes",
    "on" (case-insensitive, surrounding whitespace ignored); every
    other non-empty string reads as False.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if not value:
        # Set-but-empty: treat as unconfigured rather than as "off".
        return default
    return value in ("1", "true", "yes", "on")
def _env_float(name: str, default: float) -> float:
    """Read a float tunable from the environment.

    Returns the parsed value of environment variable ``name``. When
    the variable is unset the textual form of ``default`` is parsed
    instead; when parsing fails (for example an empty or non-numeric
    string) ``default`` itself is returned unchanged.
    """
    configured = os.environ.get(name)
    text = str(default) if configured is None else configured
    try:
        parsed = float(text)
    except (TypeError, ValueError):
        parsed = default
    return parsed
def _env_int(name: str, default: int) -> int:
    """Read an integer tunable from the environment.

    Returns the parsed value of environment variable ``name``. When
    the variable is unset the textual form of ``default`` is parsed
    instead; when parsing fails (for example an empty string or a
    value such as "8.0") ``default`` itself is returned unchanged.
    """
    configured = os.environ.get(name)
    text = str(default) if configured is None else configured
    try:
        parsed = int(text)
    except (TypeError, ValueError):
        parsed = default
    return parsed
@dataclass(frozen=True)
class PreprocessConfig:
    """Immutable tunables for the preprocessing pipeline.

    Follows the DOCLING_* environment convention used by converter.py.
    The feature ships dark: ``enabled`` is False by default, so the
    module can be merged and tested without changing production
    behaviour until the environment flag is flipped.
    """

    # Master switch; preprocess_to_pdf is a pass-through while False.
    enabled: bool = False
    # Op ids to run; preprocess_image tests membership per stage.
    ops: Tuple[str, ...] = DEFAULT_OPS
    # Pixel budget; larger images are downscaled before any other work.
    max_pixels: int = 24_000_000
    # Deskew is skipped when the estimated angle exceeds this magnitude.
    deskew_max_deg: float = 15.0
    # CLAHE clip limit and tile grid edge (grid x grid tiles).
    clahe_clip: float = 2.0
    clahe_grid: int = 8
    # Opt-in adaptive threshold as a final stage.
    binarize: bool = False
    # Let a caller-supplied scorer veto the processed image.
    never_worse: bool = True

    @classmethod
    def from_env(cls) -> "PreprocessConfig":
        """Build a config from DOCLING_PREPROCESS_* environment variables.

        Each unset variable falls back to the same value as the field
        default. DOCLING_PREPROCESS_OPS is a comma-separated list of op
        ids; blank entries are dropped, and an unset or empty variable
        selects DEFAULT_OPS.
        """
        raw_ops = os.environ.get("DOCLING_PREPROCESS_OPS")
        if raw_ops:
            trimmed = (piece.strip() for piece in raw_ops.split(","))
            selected_ops = tuple(op_id for op_id in trimmed if op_id)
        else:
            selected_ops = DEFAULT_OPS

        settings = {
            "enabled": _env_flag("DOCLING_PREPROCESS_ENABLED", False),
            "ops": selected_ops,
            "max_pixels": _env_int("DOCLING_PREPROCESS_MAX_PIXELS", 24_000_000),
            "deskew_max_deg": _env_float("DOCLING_PREPROCESS_DESKEW_MAX_DEG", 15.0),
            "clahe_clip": _env_float("DOCLING_PREPROCESS_CLAHE_CLIP", 2.0),
            "clahe_grid": _env_int("DOCLING_PREPROCESS_CLAHE_GRID", 8),
            "binarize": _env_flag("DOCLING_PREPROCESS_BINARIZE", False),
            "never_worse": _env_flag("DOCLING_PREPROCESS_NEVER_WORSE", True),
        }
        return cls(**settings)
@dataclass
class PreprocessResult:
    """Outcome of one pipeline run: the image to hand to OCR plus a
    record of what was done to it, kept for logging and for authoring
    the demo's honest before/after art."""

    # Final single-channel (grayscale) uint8 image.
    image: np.ndarray
    # Post-EXIF, post-downscale grayscale baseline.
    original: np.ndarray
    # Op ids that actually ran, in execution order.
    applied: List[str] = field(default_factory=list)
    # (op id, snapshot of the image after that op) pairs.
    stages: List[Tuple[str, np.ndarray]] = field(default_factory=list)
    # Angle passed to rotate() by the deskew stage; 0.0 if it did not run.
    skew_deg: float = 0.0
    # True when a page quad was found and perspective was flattened.
    dewarped: bool = False
    # True when the never-worse check fell back to ``original``.
    used_original: bool = False
# ----------------------------------------------------------------- load
def load_oriented_gray(path: str) -> np.ndarray:
    """Decode the image at ``path`` into an upright grayscale array.

    The EXIF orientation tag is applied before anything else: a
    sideways phone photo stores upright sensor pixels plus a rotate
    flag, and every later stage assumes the content is already
    upright. The result is a C-contiguous uint8 array.
    """
    with Image.open(path) as source:
        upright = ImageOps.exif_transpose(source)
        luminance = upright.convert("L")
        pixels = np.asarray(luminance, dtype=np.uint8)
    return np.ascontiguousarray(pixels)
def _downscale(gray: np.ndarray, max_pixels: int) -> np.ndarray:
    """Shrink ``gray`` so its pixel count fits within ``max_pixels``.

    An image already within budget is returned as-is (same object, no
    copy). Otherwise both edges are scaled by the same factor, each
    clamped to at least one pixel, and resampled with area
    interpolation.
    """
    rows, cols = gray.shape[:2]
    pixel_count = rows * cols
    if pixel_count > max_pixels:
        # sqrt of the area ratio gives the per-edge shrink factor.
        factor = (max_pixels / float(pixel_count)) ** 0.5
        target_size = (max(1, int(cols * factor)), max(1, int(rows * factor)))
        return cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
    return gray
# -------------------------------------------------------------- dewarp
def _order_quad(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as top-left, top-right, bottom-right, bottom-left.

    ``pts`` is anything reshapeable to (4, 2) -- including the
    (4, 1, 2) arrays OpenCV contour functions produce. Returns a
    float32 (4, 2) array.

    The corners are sorted by their angle around the centroid, which
    walks a convex quad clockwise on screen (image y grows downward),
    and the walk then starts at the corner with the smallest x+y
    ("top-left"). Unlike picking each corner independently by x+y /
    y-x extremes, this can never select the same vertex twice: for a
    diamond such as (5,0),(10,5),(5,10),(0,5) the extremes approach
    returns (5,0) as both top-left and top-right and loses (0,5),
    handing a degenerate quad to the warp. For quads where the
    extremes approach yields four distinct corners the result is the
    same.
    """
    pts = np.asarray(pts).reshape(4, 2).astype(np.float32)
    centre = pts.mean(axis=0)
    # Ascending atan2 in image coordinates == clockwise on screen.
    angles = np.arctan2(pts[:, 1] - centre[1], pts[:, 0] - centre[0])
    ring = pts[np.argsort(angles, kind="stable")]
    # Rotate the cycle so it begins at the top-left-most corner.
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0).astype(np.float32)
def find_document_quad(gray: np.ndarray) -> Optional[np.ndarray]:
    """Find the largest convex 4-vertex contour that plausibly is the
    page.

    Returns the ordered quad (see ``_order_quad``) in full-resolution
    coordinates, or None when no confident page boundary is found; the
    caller then skips dewarp -- never crop blind. An empty frame also
    returns None.

    Detection runs on a copy whose long edge is capped at 1000 px.
    Each downscaled dimension is clamped to at least one pixel so an
    extreme-aspect upload (e.g. 4000x3) cannot ask cv2.resize for a
    zero-sized output and crash the pipeline.
    """
    h, w = gray.shape[:2]
    if h == 0 or w == 0:
        # Nothing to search; edge detection on an empty frame would raise.
        return None
    frame_area = float(h * w)

    # Work on a downscaled copy for speed + edge stability.
    long_edge = max(h, w)
    scale = 1000.0 / long_edge if long_edge > 1000 else 1.0
    if scale < 1.0:
        small_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        small = cv2.resize(gray, small_size, interpolation=cv2.INTER_AREA)
    else:
        small = gray

    # Blur -> Canny -> dilate, so the page outline forms one closed
    # external contour.
    blurred = cv2.GaussianBlur(small, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)
    edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=1)
    contours, _ = cv2.findContours(
        edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    small_area = float(small.shape[0] * small.shape[1])
    best: Optional[np.ndarray] = None
    # Only the five largest contours are candidates.
    for cnt in sorted(contours, key=cv2.contourArea, reverse=True)[:5]:
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
        if len(approx) != 4 or not cv2.isContourConvex(approx):
            continue
        # Require the quad to cover a real fraction of the frame, else
        # it is probably a logo box or a line-item cell, not the page.
        if cv2.contourArea(approx) < 0.25 * small_area:
            continue
        # Map back from the downscaled copy to full-resolution coords.
        best = approx.astype(np.float32) / scale
        break
    if best is None:
        return None

    quad = _order_quad(best)
    # Final sanity: the quad's area in full-res must still be a real
    # fraction of the frame.
    if cv2.contourArea(quad) < 0.25 * frame_area:
        return None
    return quad
def four_point_warp(gray: np.ndarray, quad: np.ndarray) -> np.ndarray:
    """Flatten the region bounded by ``quad`` into an upright rectangle.

    ``quad`` holds four corners in top-left, top-right, bottom-right,
    bottom-left order. The output width is the longer of the two
    horizontal edges and the height the longer of the two vertical
    edges (each truncated to an int and at least 1 px).
    """
    top_left, top_right, bottom_right, bottom_left = quad

    bottom_len = np.linalg.norm(bottom_right - bottom_left)
    top_len = np.linalg.norm(top_right - top_left)
    right_len = np.linalg.norm(top_right - bottom_right)
    left_len = np.linalg.norm(top_left - bottom_left)
    out_w = max(int(max(bottom_len, top_len)), 1)
    out_h = max(int(max(right_len, left_len)), 1)

    # Destination corners, in the same order as the source quad.
    last_col, last_row = out_w - 1, out_h - 1
    target = np.array(
        [[0, 0], [last_col, 0], [last_col, last_row], [0, last_row]],
        dtype=np.float32,
    )
    homography = cv2.getPerspectiveTransform(quad, target)
    return cv2.warpPerspective(gray, homography, (out_w, out_h))
# -------------------------------------------------------------- deskew
def estimate_skew(gray: np.ndarray) -> float:
    """Estimate page skew in degrees from the minimum-area rectangle
    around the ink mask.

    The ink mask is an inverted Otsu threshold (dark print becomes
    foreground). Returns 0.0 when fewer than 50 ink pixels are found,
    i.e. there is not enough ink to judge. Otherwise the rectangle
    angle is folded into (-45, 45], and preprocess_image passes that
    value straight to ``rotate`` as the correction.

    NOTE(review): which rotation direction a positive value stands for
    depends on the minAreaRect / getRotationMatrix2D conventions --
    confirm against a known-skew fixture before relying on the sign.
    """
    _, ink_mask = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )
    ink_points = cv2.findNonZero(ink_mask)
    if ink_points is None or len(ink_points) < 50:
        return 0.0

    rect_angle = cv2.minAreaRect(ink_points)[-1]
    # OpenCV builds disagree on the reported range ((0,90] vs
    # [-90,0]); taking the angle modulo 90 removes that ambiguity, and
    # anything past 45 is the same rectangle seen from its other edge.
    folded = rect_angle % 90
    return float(folded - 90 if folded > 45 else folded)
def rotate(gray: np.ndarray, angle_deg: float) -> np.ndarray:
    """Rotate ``gray`` by ``angle_deg`` about its centre without
    clipping.

    The canvas grows to the bounding box of the rotated frame, and the
    newly exposed corners are filled with white (255) so they read as
    paper. Resampling is bicubic.
    """
    rows, cols = gray.shape[:2]
    pivot = (cols / 2.0, rows / 2.0)
    affine = cv2.getRotationMatrix2D(pivot, angle_deg, 1.0)

    # Bounding box of the rotated frame.
    abs_cos = abs(affine[0, 0])
    abs_sin = abs(affine[0, 1])
    out_w = int(rows * abs_sin + cols * abs_cos)
    out_h = int(rows * abs_cos + cols * abs_sin)

    # Shift so the old centre lands on the centre of the new canvas.
    affine[0, 2] += (out_w / 2.0) - pivot[0]
    affine[1, 2] += (out_h / 2.0) - pivot[1]

    return cv2.warpAffine(
        gray,
        affine,
        (out_w, out_h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=255,
    )
# --------------------------------------------------- denoise / contrast
def denoise(gray: np.ndarray) -> np.ndarray:
    """Apply a conservative non-local-means denoise.

    The filter strength is kept small (h=7) so faint thermal strokes
    survive; a 7 px template is matched inside a 21 px search window.
    """
    settings = {"h": 7, "templateWindowSize": 7, "searchWindowSize": 21}
    return cv2.fastNlMeansDenoising(gray, **settings)
def normalize_illumination(gray: np.ndarray) -> np.ndarray:
    """Even out shadows and glare by dividing the image by an estimate
    of its background.

    The background comes from a morphological close with a 31x31
    elliptical kernel. Dividing by it and rescaling by 255 pushes
    paper toward white. The big win on crumpled/shadowed phone shots.
    """
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (31, 31))
    paper = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, ellipse)
    # A zero divisor is bumped to 1 so the division below is defined.
    divisor = np.where(paper == 0, 1, paper).astype(np.uint8)
    flattened = cv2.divide(gray, divisor, scale=255)
    return flattened.astype(np.uint8)
def apply_clahe(gray: np.ndarray, clip: float, grid: int) -> np.ndarray:
    """Boost local contrast with CLAHE.

    ``clip`` is the clip limit and ``grid`` the number of tiles per
    edge (a grid x grid layout). Working tile by tile recovers faint
    regions without blowing out bright ones.
    """
    tiles = (grid, grid)
    equalizer = cv2.createCLAHE(clipLimit=clip, tileGridSize=tiles)
    return equalizer.apply(gray)
def binarize(gray: np.ndarray) -> np.ndarray:
    """Threshold to pure black/white with an adaptive Gaussian window.

    Off by default: aggressive binarization kills faint thermal print,
    so this stage is opt-in for pathological low-contrast scans only.
    Uses a 25 px neighbourhood with an offset of 10.
    """
    white = 255
    neighbourhood = 25
    offset = 10
    return cv2.adaptiveThreshold(
        gray,
        white,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=neighbourhood,
        C=offset,
    )
def legibility(gray: np.ndarray) -> float:
    """Return the variance of the Laplacian, a cheap sharpness proxy.

    Intended for diagnostics and tests only. It is NOT the never-worse
    arbiter, because denoising legitimately lowers it; the real
    arbiter is an OCR-based scorer supplied by the caller.
    """
    second_derivative = cv2.Laplacian(gray, cv2.CV_64F)
    return float(second_derivative.var())
# ------------------------------------------------------- orchestration
def preprocess_image(
    path: str,
    config: PreprocessConfig,
    *,
    scorer: Optional[Callable[[np.ndarray], float]] = None,
) -> PreprocessResult:
    """Run the configured cleanup pipeline on the image file at
    ``path``.

    Stages run in a fixed order -- dewarp, deskew, denoise,
    illumination, contrast, then the optional binarize -- and each one
    that actually runs is recorded in ``applied`` with a snapshot in
    ``stages``. "autorotate" is always recorded because EXIF
    orientation is applied while loading. Dewarp is skipped when no
    confident page quad is found; deskew is skipped when the estimated
    angle is under 0.1 degrees or above ``config.deskew_max_deg``.

    ``scorer`` is the never-worse arbiter: a callable mapping an image
    to a legibility score (e.g. OCR token count). When
    ``config.never_worse`` is set and a scorer is supplied, the
    baseline image is returned instead of the processed one if the
    processed image scores strictly lower. With no scorer the module
    takes no position and returns the processed image (the integration
    layer supplies the OCR-based scorer).
    """
    baseline = _downscale(load_oriented_gray(path), config.max_pixels)
    original = baseline.copy()
    result = PreprocessResult(image=baseline, original=original)

    def record(stage: str, pixels: np.ndarray) -> None:
        # Log the op id and keep a private snapshot of its output.
        result.applied.append(stage)
        result.stages.append((stage, pixels.copy()))

    record("autorotate", baseline)
    work = baseline
    wanted = config.ops

    if "dewarp" in wanted:
        quad = find_document_quad(work)
        if quad is not None:
            work = four_point_warp(work, quad)
            result.dewarped = True
            record("dewarp", work)

    if "deskew" in wanted:
        angle = estimate_skew(work)
        # Ignore negligible tilt, and refuse implausibly large angles.
        if 0.1 <= abs(angle) <= config.deskew_max_deg:
            work = rotate(work, angle)
            result.skew_deg = angle
            record("deskew", work)

    if "denoise" in wanted:
        work = denoise(work)
        record("denoise", work)

    if "illumination" in wanted:
        work = normalize_illumination(work)
        record("illumination", work)

    if "contrast" in wanted:
        work = apply_clahe(work, config.clahe_clip, config.clahe_grid)
        record("contrast", work)

    if config.binarize:
        work = binarize(work)
        record("binarize", work)

    # Never worse than the original: only fires with a real scorer.
    fell_back = (
        config.never_worse
        and scorer is not None
        and scorer(work) < scorer(original)
    )
    if fell_back:
        result.used_original = True
        result.image = original
    else:
        result.image = work
    return result
def wrap_to_pdf(gray: np.ndarray, out_path: str) -> str:
    """Write ``gray`` to ``out_path`` as a one-page PDF and return
    ``out_path``.

    docling only registers InputFormat.PDF, so every cleaned raster is
    normalized to PDF here. The page is saved at 200 dpi.
    """
    # Pin all metadata so the PDF bytes are deterministic: Pillow
    # otherwise writes the source filename into /Title and the current
    # time into /CreationDate + /ModDate. A fixed epoch + empty
    # title/producer make the same pixels produce identical bytes.
    epoch = "D:19700101000000Z"
    pinned_metadata = {
        "title": "",
        "producer": "",
        "creator": "",
        "creationDate": epoch,
        "modDate": epoch,
    }
    page = Image.fromarray(gray, mode="L")
    page.save(out_path, format="PDF", resolution=200.0, **pinned_metadata)
    return out_path
def preprocess_to_pdf(
    src_path: str,
    *,
    out_dir: str = "/tmp",
    config: Optional[PreprocessConfig] = None,
    scorer: Optional[Callable[[np.ndarray], float]] = None,
) -> Tuple[str, Optional[PreprocessResult]]:
    """Top-level entry the converter calls. Returns ``(path, result)``:

    * ``(src_path, None)`` when preprocessing is disabled, the file
      cannot be opened, or the input is not a photo (PDF/TIFF/unknown)
      -- the caller proceeds with the original file, unchanged. NEVER
      mutates ``src_path``.
    * ``(new_pdf_path, result)`` when an image was cleaned -- a new
      single-page PDF in ``out_dir`` that the caller MUST clean up.

    ``config`` defaults to ``PreprocessConfig.from_env()``; ``scorer``
    is forwarded to ``preprocess_image`` as the never-worse arbiter.

    Errors raised while decoding or processing the image propagate to
    the caller. If writing the PDF fails, the temporary file created
    for it is removed before the error is re-raised: the caller never
    learns that path, so it could not clean it up itself.
    """
    cfg = config or PreprocessConfig.from_env()
    if not cfg.enabled:
        return src_path, None

    # Classify by magic bytes; 16 bytes covers every signature checked.
    try:
        with open(src_path, "rb") as f:
            header = f.read(16)
    except OSError:
        return src_path, None
    if sniff_kind(header) != "image":
        return src_path, None

    result = preprocess_image(src_path, cfg, scorer=scorer)

    # mkstemp only reserves a unique name; Pillow reopens it by path.
    fd, out_path = tempfile.mkstemp(suffix=".pdf", dir=out_dir)
    os.close(fd)
    try:
        wrap_to_pdf(result.image, out_path)
    except BaseException:
        # Do not leave an empty or partial PDF behind on failure.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path, result


# This is the file as it lives at the moment of this build. The
# canonical history lives in git. If you want the full history or a
# specific commit, write to hello@muntin.digital.