Spaces:

Rogue2003
/

Receipt_Agent

Running

App Files Files Community

Raghu committed 15 days ago

Commit

23980e2

1 Parent(s): 53ff1f6

Enhance OCR: add Tesseract fallback, better preprocessing, improved retry logic

Browse files

Files changed (2) hide show

app.py +247 -34
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -369,10 +369,16 @@ class EnsembleDocumentClassifier:
 # ============================================================================
 class ReceiptOCR:
-    """EasyOCR wrapper with retry logic."""
     def __init__(self):
         self.reader = None
     def load(self):
         if self.reader is None:
@@ -381,14 +387,162 @@ class ReceiptOCR:
             print("EasyOCR ready")
         return self
-    def extract_with_positions(self, image, min_confidence=0.3):
         if self.reader is None:
             self.load()
         if isinstance(image, Image.Image):
             image = np.array(image)
-        results = self.reader.readtext(image)
         extracted = []
         for bbox, text, conf in results:
@@ -396,15 +550,59 @@ class ReceiptOCR:
                 x_coords = [p[0] for p in bbox]
                 y_coords = [p[1] for p in bbox]
                 extracted.append({
-                    'text': text,
                     'confidence': conf,
-                    'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
                 })
         return extracted
     def postprocess_receipt(self, ocr_results):
-        """Extract structured fields from OCR results."""
         full_text = ' '.join([r['text'] for r in ocr_results])
         fields = {
@@ -417,49 +615,64 @@ class ReceiptOCR:
         return fields
     def _extract_vendor(self, ocr_results):
-        if ocr_results:
-            # Usually first line is vendor
-            return ocr_results[0]['text']
-        return None
     def _extract_date(self, text):
         patterns = [
-            r'\d{1,2}/\d{1,2}/\d{2,4}',
-            r'\d{1,2}-\d{1,2}-\d{2,4}',
-            r'\d{4}-\d{2}-\d{2}',
         ]
         for pattern in patterns:
-            match = re.search(pattern, text)
-            if match:
-                return match.group()
         return None
     def _extract_total(self, text):
         patterns = [
-            r'TOTAL[:\s]*\$?(\d+\.?\d*)',
-            r'AMOUNT[:\s]*\$?(\d+\.?\d*)',
-            r'DUE[:\s]*\$?(\d+\.?\d*)',
         ]
         for pattern in patterns:
-            match = re.search(pattern, text, re.IGNORECASE)
-            if match:
-                return match.group(1)
-        # Find largest dollar amount
-        amounts = re.findall(r'\$(\d+\.\d{2})', text)
-        if amounts:
-            return max(amounts, key=float)
         return None
     def _extract_time(self, text):
-        pattern = r'\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AP]M)?'
-        match = re.search(pattern, text, re.IGNORECASE)
-        return match.group() if match else None
-# ============================================================================
-# LayoutLMv3 Field Extractor
-# ============================================================================
 class LayoutLMFieldExtractor:
     """LayoutLMv3-based field extractor using fine-tuned weights if available."""

 # ============================================================================
 class ReceiptOCR:
+    """Enhanced OCR with EasyOCR + Tesseract fallback, better preprocessing, and retry logic."""
     def __init__(self):
         self.reader = None
+        self.use_tesseract = False
+        try:
+            import pytesseract
+            self.use_tesseract = True
+        except ImportError:
+            pass
     def load(self):
         if self.reader is None:
             print("EasyOCR ready")
         return self
+    def _preprocess_image(self, image, method='enhance'):
+        """Apply image preprocessing to improve OCR accuracy."""
+        import cv2
+        if isinstance(image, Image.Image):
+            img_array = np.array(image)
+        else:
+            img_array = image.copy()
+        if method == 'enhance':
+            # Convert to grayscale if needed
+            if len(img_array.shape) == 3:
+                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            else:
+                gray = img_array
+            # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+            enhanced = clahe.apply(gray)
+            # Denoise
+            denoised = cv2.fastNlMeansDenoising(enhanced, h=10)
+            # Convert back to RGB for EasyOCR
+            return cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
+        elif method == 'sharpen':
+            # Sharpen the image
+            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
+            if len(img_array.shape) == 3:
+                sharpened = cv2.filter2D(img_array, -1, kernel)
+            else:
+                gray = img_array
+                sharpened = cv2.filter2D(gray, -1, kernel)
+                sharpened = cv2.cvtColor(sharpened, cv2.COLOR_GRAY2RGB)
+            return sharpened
+        elif method == 'binarize':
+            # Adaptive thresholding
+            if len(img_array.shape) == 3:
+                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            else:
+                gray = img_array
+            binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                          cv2.THRESH_BINARY, 11, 2)
+            return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
+        return img_array
+    def _extract_with_tesseract(self, image):
+        """Fallback OCR using Tesseract."""
+        if not self.use_tesseract:
+            return []
+        try:
+            import pytesseract
+            if isinstance(image, Image.Image):
+                pil_image = image.convert('RGB')
+            else:
+                pil_image = Image.fromarray(image).convert('RGB')
+            # Get detailed output with bounding boxes
+            data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
+            results = []
+            n_boxes = len(data['text'])
+            for i in range(n_boxes):
+                text = data['text'][i].strip()
+                conf = int(data['conf'][i])
+                if text and conf > 0:
+                    x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
+                    results.append({
+                        'text': text,
+                        'confidence': conf / 100.0,
+                        'bbox': [x, y, x+w, y+h],
+                        'engine': 'tesseract'
+                    })
+            return results
+        except Exception as e:
+            print(f"Tesseract OCR error: {e}")
+            return []
+    def _merge_ocr_results(self, easyocr_results, tesseract_results):
+        """Merge results from multiple OCR engines, preferring higher confidence."""
+        if not tesseract_results:
+            return easyocr_results
+        # Create a map of EasyOCR results by approximate position
+        merged = []
+        used_tesseract = set()
+        for easy_result in easyocr_results:
+            best_match = None
+            best_iou = 0
+            # Find best matching Tesseract result
+            for i, tess_result in enumerate(tesseract_results):
+                if i in used_tesseract:
+                    continue
+                # Simple IoU calculation
+                iou = self._compute_iou(easy_result['bbox'], tess_result['bbox'])
+                if iou > best_iou and iou > 0.3:  # 30% overlap threshold
+                    best_iou = iou
+                    best_match = (i, tess_result)
+            if best_match and best_match[1]['confidence'] > easy_result['confidence']:
+                # Use Tesseract result if it's more confident
+                merged.append(best_match[1])
+                used_tesseract.add(best_match[0])
+            else:
+                merged.append(easy_result)
+        # Add unused Tesseract results
+        for i, tess_result in enumerate(tesseract_results):
+            if i not in used_tesseract:
+                merged.append(tess_result)
+        return merged
+    def _compute_iou(self, box1, box2):
+        """Compute Intersection over Union for bounding boxes."""
+        x1_1, y1_1, x2_1, y2_1 = box1
+        x1_2, y1_2, x2_2, y2_2 = box2
+        xi1 = max(x1_1, x1_2)
+        yi1 = max(y1_1, y1_2)
+        xi2 = min(x2_1, x2_2)
+        yi2 = min(y2_1, y2_2)
+        inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
+        box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
+        box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
+        union_area = box1_area + box2_area - inter_area
+        return inter_area / union_area if union_area > 0 else 0
+    def extract_with_positions(self, image, min_confidence=0.3, use_fallback=True):
+        """Extract text with positions using EasyOCR + optional Tesseract fallback."""
         if self.reader is None:
             self.load()
+        original_image = image
         if isinstance(image, Image.Image):
             image = np.array(image)
+        # Try EasyOCR first
+        try:
+            results = self.reader.readtext(image)
+        except Exception as e:
+            print(f"EasyOCR error: {e}")
+            results = []
         extracted = []
         for bbox, text, conf in results:
                 x_coords = [p[0] for p in bbox]
                 y_coords = [p[1] for p in bbox]
                 extracted.append({
+                    'text': text.strip(),
                     'confidence': conf,
+                    'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
+                    'engine': 'easyocr'
                 })
+        # Check if we need fallback (low confidence or few results)
+        avg_confidence = np.mean([r['confidence'] for r in extracted]) if extracted else 0
+        needs_fallback = use_fallback and (len(extracted) < 3 or avg_confidence < 0.5)
+        if needs_fallback and self.use_tesseract:
+            # Try preprocessing + Tesseract
+            preprocessed = self._preprocess_image(original_image, method='enhance')
+            tesseract_results = self._extract_with_tesseract(preprocessed)
+            if tesseract_results:
+                # Merge results
+                extracted = self._merge_ocr_results(extracted, tesseract_results)
+        # If still poor results, try with preprocessing
+        if len(extracted) < 3 or avg_confidence < 0.4:
+            for method in ['enhance', 'sharpen']:
+                try:
+                    preprocessed = self._preprocess_image(original_image, method=method)
+                    retry_results = self.reader.readtext(preprocessed)
+                    retry_extracted = []
+                    for bbox, text, conf in retry_results:
+                        if conf >= min_confidence:
+                            x_coords = [p[0] for p in bbox]
+                            y_coords = [p[1] for p in bbox]
+                            retry_extracted.append({
+                                'text': text.strip(),
+                                'confidence': conf,
+                                'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
+                                'engine': 'easyocr'
+                            })
+                    # Use retry if it's better
+                    retry_avg = np.mean([r['confidence'] for r in retry_extracted]) if retry_extracted else 0
+                    if retry_avg > avg_confidence:
+                        extracted = retry_extracted
+                        break
+                except Exception as e:
+                    continue
+        # Sort by confidence (highest first)
+        extracted.sort(key=lambda x: x['confidence'], reverse=True)
         return extracted
     def postprocess_receipt(self, ocr_results):
+        """Extract structured fields from OCR results with improved patterns."""
         full_text = ' '.join([r['text'] for r in ocr_results])
         fields = {
         return fields
     def _extract_vendor(self, ocr_results):
+        """Extract vendor name, usually in first few lines."""
+        if not ocr_results:
+            return None
+        # Look for vendor in top 3 results (usually at top of receipt)
+        top_results = sorted(ocr_results, key=lambda x: x['bbox'][1])[:3]
+        for result in top_results:
+            text = result['text'].strip()
+            # Skip common non-vendor words
+            if text and len(text) > 2 and text.upper() not in ['TOTAL', 'DATE', 'TIME', 'RECEIPT', 'THANK', 'YOU']:
+                # Take longest text as vendor (usually company name)
+                if len(text) > 5:
+                    return text
+        return top_results[0]['text'] if top_results else None
     def _extract_date(self, text):
+        """Extract date with improved patterns."""
         patterns = [
+            r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or MM-DD-YYYY
+            r'\b\d{4}[/-]\d{2}[/-]\d{2}\b',  # YYYY-MM-DD
+            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b',  # Month DD, YYYY
         ]
         for pattern in patterns:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            if matches:
+                return matches[0]
         return None
     def _extract_total(self, text):
+        """Extract total amount with improved patterns."""
+        # Look for TOTAL, AMOUNT, DUE keywords
         patterns = [
+            r'(?:TOTAL|AMOUNT|DUE|BALANCE)[:\s]*\$?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',
+            r'\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',  # Any dollar amount
         ]
         for pattern in patterns:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            if matches:
+                # Return largest amount (usually the total)
+                amounts = [float(m.replace(',', '')) for m in matches]
+                return f"{max(amounts):.2f}"
         return None
     def _extract_time(self, text):
+        """Extract time."""
+        patterns = [
+            r'\b(\d{1,2}):(\d{2})\s*(?:AM|PM)\b',
+            r'\b(\d{1,2}):(\d{2})\b',
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(0)
+        return None
 class LayoutLMFieldExtractor:
     """LayoutLMv3-based field extractor using fine-tuned weights if available."""

requirements.txt CHANGED Viewed

@@ -10,3 +10,4 @@ numpy>=1.21.0
 scikit-learn>=1.0.0
 opencv-python-headless>=4.5.0

 scikit-learn>=1.0.0
 opencv-python-headless>=4.5.0
+pytesseract>=0.3.10