Spaces:
Sleeping
Sleeping
| """ | |
| Book Cover Analyzer - Web-Ready Version | |
| Refactored V4 for Flask integration | |
| """ | |
| import cv2 | |
| import numpy as np | |
| import easyocr | |
| import torch | |
| import torchvision.models as models | |
| import torchvision.transforms as transforms | |
| from PIL import Image | |
| import warnings | |
| import imutils | |
| warnings.filterwarnings('ignore') | |
| class BookCoverAnalyzer: | |
| """ | |
| Complete book cover analysis pipeline. | |
| ML CONCEPT: Stateful Service | |
| ============================ | |
| This class is designed for web deployment: | |
| - Initialize once (loads models into memory) | |
| - Process many images (reuse loaded models) | |
| - Thread-safe design (can handle multiple requests) | |
| Benefits: | |
| - Fast: Models loaded once, not per request | |
| - Memory efficient: Shared model weights | |
| - Production-ready: Error handling included | |
| """ | |
| def __init__(self, verbose=False): | |
| """ | |
| Initialize analyzer with all ML models. | |
| ML CONCEPT: Model Loading Strategy | |
| =================================== | |
| We load ALL models at startup (not lazy): | |
| - EasyOCR: Text detection + recognition | |
| - PyTorch ResNet18: Image classification | |
| Why load at startup? | |
| - Web server starts once | |
| - First request doesn't have cold-start delay | |
| - Consistent response times | |
| """ | |
| self.verbose = verbose | |
| if self.verbose: | |
| print("[INFO] Initializing Book Cover Analyzer...") | |
| # Load EasyOCR | |
| if self.verbose: | |
| print("[INFO] Loading EasyOCR...") | |
| self.ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False) | |
| # Load PyTorch model | |
| if self.verbose: | |
| print("[INFO] Loading PyTorch ResNet18...") | |
| self.pytorch_model = models.resnet18(weights='DEFAULT') | |
| self.pytorch_model.eval() | |
| # Create preprocessing pipeline | |
| self.transform = transforms.Compose([ | |
| transforms.Resize(256), | |
| transforms.CenterCrop(224), | |
| transforms.ToTensor(), | |
| transforms.Normalize( | |
| mean=[0.485, 0.456, 0.406], | |
| std=[0.229, 0.224, 0.225] | |
| ) | |
| ]) | |
| # Load ImageNet class labels | |
| import os | |
| labels_path = os.path.join(os.path.dirname(__file__), 'imagenet_classes.txt') | |
| try: | |
| with open(labels_path, 'r') as f: | |
| self.imagenet_classes = [line.strip() for line in f.readlines()] | |
| except FileNotFoundError: | |
| if self.verbose: | |
| print("[WARNING] ImageNet classes file not found, using generic labels") | |
| self.imagenet_classes = None | |
| if self.verbose: | |
| print("[SUCCESS] All models loaded!") | |
| def analyze(self, image_path): | |
| """ | |
| Complete analysis pipeline. | |
| Args: | |
| image_path: Path to book cover image | |
| Returns: | |
| dict: { | |
| 'success': bool, | |
| 'book_extracted': bool, | |
| 'text_regions': [...], | |
| 'image_regions': [...], | |
| 'summary': {...}, | |
| 'error': str (if failed) | |
| } | |
| ML CONCEPT: Pipeline Architecture | |
| ================================== | |
| Stage 1: OpenCV (Classical CV) - Find book | |
| Stage 2: EasyOCR (DL) - Detect & read text | |
| Stage 3: PyTorch (DL) - Classify images | |
| Each stage can fail independently. | |
| We return partial results + error status. | |
| """ | |
| try: | |
| # Load image | |
| image = cv2.imread(image_path) | |
| if image is None: | |
| return { | |
| 'success': False, | |
| 'error': 'Could not load image' | |
| } | |
| # Stage 1: Find and extract book | |
| book_contour = self._find_book(image) | |
| if book_contour is None: | |
| # Fallback: If book detection fails, process whole image | |
| # This handles complex scenes where edge detection struggles | |
| if self.verbose: | |
| print("[INFO] Book detection failed, processing entire image") | |
| book_image = image | |
| book_extracted = False | |
| else: | |
| # Successfully detected book boundary | |
| book_image = self._extract_book(image, book_contour) | |
| book_extracted = True | |
| # Stage 2: Text detection and recognition | |
| text_regions = self._detect_text(book_image) | |
| # Stage 3: Image classification | |
| image_regions = self._classify_images(book_image, text_regions) | |
| # Stage 4: Generate human-readable interpretation | |
| interpretation = self._generate_interpretation(text_regions, image_regions) | |
| # Summary statistics | |
| summary = { | |
| 'total_text_regions': len(text_regions), | |
| 'total_image_regions': len(image_regions), | |
| 'book_dimensions': f"{book_image.shape[1]}x{book_image.shape[0]}" | |
| } | |
| return { | |
| 'success': True, | |
| 'book_extracted': book_extracted, | |
| 'text_regions': text_regions, | |
| 'image_regions': image_regions, | |
| 'summary': summary, | |
| 'interpretation': interpretation | |
| } | |
| except Exception as e: | |
| return { | |
| 'success': False, | |
| 'error': f'Analysis failed: {str(e)}' | |
| } | |
| def _generate_interpretation(self, text_regions, image_regions): | |
| """ | |
| Generate human-readable interpretation of analysis results. | |
| ML CONCEPT: Post-Processing and Interpretation | |
| =============================================== | |
| Raw ML outputs need human-readable summaries. | |
| This method: | |
| 1. Combines all detected text into readable format | |
| 2. Describes what types of images were found | |
| 3. Provides context for non-technical users | |
| This bridges the gap between ML predictions and user understanding. | |
| """ | |
| interpretation = {} | |
| # Text interpretation with smart inference | |
| if text_regions: | |
| # Combine all text sorted by position (top to bottom) | |
| sorted_texts = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x'])) | |
| all_text = [r['text'] for r in sorted_texts] | |
| # High confidence text only (> 70%) | |
| high_conf_text = [r['text'] for r in sorted_texts if r['confidence'] > 0.7] | |
| interpretation['full_text'] = ' '.join(all_text) | |
| interpretation['high_confidence_text'] = ' '.join(high_conf_text) | |
| interpretation['word_count'] = len(all_text) | |
| # Smart inference: Infer book title, author, publisher | |
| interpretation.update(self._infer_book_metadata(text_regions)) | |
| else: | |
| interpretation['full_text'] = "No text detected" | |
| interpretation['high_confidence_text'] = "" | |
| interpretation['word_count'] = 0 | |
| interpretation['inferred_title'] = None | |
| interpretation['inferred_authors'] = [] | |
| interpretation['inferred_publisher'] = None | |
| interpretation['other_text'] = [] | |
| # Image interpretation | |
| if image_regions: | |
| image_descriptions = [] | |
| for img in image_regions: | |
| location = img.get('location', 'unknown') | |
| confidence = img['confidence'] * 100 | |
| class_name = img['pytorch_class'] | |
| image_descriptions.append({ | |
| 'location': location.replace('_', ' ').title(), | |
| 'classification': class_name, | |
| 'confidence': f"{confidence:.1f}%", | |
| 'description': f"Found visual element in {location.replace('_', ' ')} area (classified as {class_name} with {confidence:.1f}% confidence)" | |
| }) | |
| interpretation['images'] = image_descriptions | |
| interpretation['image_summary'] = f"Detected {len(image_regions)} visual elements/illustrations on the cover" | |
| else: | |
| interpretation['images'] = [] | |
| interpretation['image_summary'] = "No distinct image regions detected (cover may be text-only or require closer inspection)" | |
| # Overall interpretation | |
| if text_regions and image_regions: | |
| interpretation['cover_type'] = "Mixed - Contains both text and visual elements" | |
| elif text_regions and not image_regions: | |
| interpretation['cover_type'] = "Text-heavy - Primarily text-based design" | |
| elif image_regions and not text_regions: | |
| interpretation['cover_type'] = "Visual-heavy - Primarily image-based design" | |
| else: | |
| interpretation['cover_type'] = "Unknown - Analysis incomplete" | |
| return interpretation | |
| def _infer_book_metadata(self, text_regions): | |
| """ | |
| Infer book title, author, and publisher from detected text. | |
| ML CONCEPT: Heuristic-Based Inference | |
| ====================================== | |
| Uses simple rules to guess book metadata: | |
| - Title: Largest text, usually at top | |
| - Author: Often capitalized names, medium-large text | |
| - Publisher: Smaller text, often at bottom | |
| - Reviews/Quotes: Text with quotation marks | |
| This is NOT ML - it's rule-based heuristics! | |
| For better accuracy, would use NER (Named Entity Recognition). | |
| """ | |
| import re | |
| sorted_regions = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x'])) | |
| # Categorize by size and position | |
| large_text = [r for r in text_regions if r['bbox']['height'] > 50] # Very large (title candidates) | |
| medium_text = [r for r in text_regions if 30 < r['bbox']['height'] <= 50] # Medium (author/subtitle) | |
| small_text = [r for r in text_regions if r['bbox']['height'] <= 30] # Small (publisher, quotes) | |
| # Sort by vertical position | |
| large_text.sort(key=lambda r: r['bbox']['y']) | |
| medium_text.sort(key=lambda r: r['bbox']['y']) | |
| small_text.sort(key=lambda r: r['bbox']['y']) | |
| result = {} | |
| # Infer Title: Largest text at top | |
| if large_text: | |
| title_parts = [r['text'] for r in large_text[:2]] # Top 2 largest | |
| result['inferred_title'] = ' '.join(title_parts) | |
| else: | |
| result['inferred_title'] = None | |
| # Infer Authors: Look for capitalized names | |
| authors = [] | |
| for region in medium_text + large_text: | |
| text = region['text'] | |
| # Check if mostly uppercase (likely author name) | |
| if text.isupper() and len(text) > 3: | |
| # Avoid common words like "THE", "OF" | |
| if text not in ['THE', 'OF', 'AND', 'IN', 'A', 'AN']: | |
| authors.append(text.title()) # Convert to title case | |
| # Also check for name patterns (First Last) | |
| for region in text_regions: | |
| text = region['text'] | |
| # Pattern: Two or more capitalized words | |
| words = text.split() | |
| if len(words) >= 2 and all(w[0].isupper() if w else False for w in words): | |
| if text not in authors and len(text) > 5: | |
| authors.append(text) | |
| result['inferred_authors'] = list(set(authors))[:3] # Dedupe, max 3 | |
| # Infer Publisher: Small text at bottom | |
| bottom_small = [r for r in small_text if r['bbox']['y'] > sum([reg['bbox']['y'] for reg in text_regions]) / len(text_regions)] | |
| publisher_candidates = [] | |
| for region in bottom_small: | |
| text = region['text'] | |
| # Skip quotes, numbers, common words | |
| if not any(char in text for char in ['"', "'", '«', '»']) and not text.isdigit(): | |
| if len(text) > 3: | |
| publisher_candidates.append(text) | |
| result['inferred_publisher'] = publisher_candidates[0] if publisher_candidates else None | |
| # Collect other meaningful text (reviews, quotes, etc.) | |
| other_text = [] | |
| for region in text_regions: | |
| text = region['text'] | |
| # Text with quotes (likely reviews) | |
| if any(char in text for char in ['"', "'", '«', '»', 'Compelling', 'Fascinating']): | |
| other_text.append(text) | |
| result['other_text'] = other_text[:5] # Max 5 items | |
| return result | |
| def _find_book(self, image): | |
| """ | |
| Stage 1: Find book using edge detection with fallback. | |
| ML CONCEPT: Classical Computer Vision with Robustness | |
| ====================================================== | |
| Uses hand-crafted algorithms (not ML): | |
| - Canny edge detection with multiple thresholds | |
| - Contour finding | |
| - Shape analysis (aspect ratio, area) | |
| - Fallback strategy for real-world photos | |
| IMPROVEMENT: Added fallback for books in complex scenes | |
| (e.g., book on desk with other objects) | |
| """ | |
| gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |
| blurred = cv2.GaussianBlur(gray, (5, 5), 0) | |
| # Try multiple edge detection thresholds | |
| edge_params = [ | |
| (50, 150), # Original | |
| (30, 100), # More sensitive | |
| (75, 200), # Less sensitive | |
| ] | |
| for low_thresh, high_thresh in edge_params: | |
| edged = cv2.Canny(blurred, low_thresh, high_thresh) | |
| contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, | |
| cv2.CHAIN_APPROX_SIMPLE) | |
| contours = imutils.grab_contours(contours) | |
| contours = sorted(contours, key=cv2.contourArea, reverse=True) | |
| # First pass: Strict criteria (ideal conditions) | |
| for contour in contours[:15]: | |
| peri = cv2.arcLength(contour, True) | |
| approx = cv2.approxPolyDP(contour, 0.02 * peri, True) | |
| area = cv2.contourArea(contour) | |
| x, y, w, h = cv2.boundingRect(contour) | |
| aspect_ratio = float(w) / h if h > 0 else 0 | |
| image_area = image.shape[0] * image.shape[1] | |
| area_percentage = (area / image_area) * 100 | |
| # Strict criteria: Clean book photos | |
| if (area_percentage > 10 and 0.4 < aspect_ratio < 1.2 | |
| and len(approx) >= 4): | |
| return contour | |
| # Second pass: Relaxed criteria (real-world photos) | |
| for contour in contours[:20]: | |
| peri = cv2.arcLength(contour, True) | |
| approx = cv2.approxPolyDP(contour, 0.02 * peri, True) | |
| area = cv2.contourArea(contour) | |
| x, y, w, h = cv2.boundingRect(contour) | |
| aspect_ratio = float(w) / h if h > 0 else 0 | |
| image_area = image.shape[0] * image.shape[1] | |
| area_percentage = (area / image_area) * 100 | |
| # Relaxed criteria: Books in complex scenes | |
| # - Lower area threshold (5% instead of 10%) | |
| # - Wider aspect ratio range (0.3 to 1.5) | |
| # - Require rectangular shape (4 corners) | |
| if (area_percentage > 5 and 0.3 < aspect_ratio < 1.5 | |
| and len(approx) >= 4 and area > 5000): | |
| return contour | |
| # Fallback: Use largest rectangular contour with reasonable size | |
| # This handles cases where book edges aren't perfect | |
| edged = cv2.Canny(blurred, 30, 100) | |
| contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, | |
| cv2.CHAIN_APPROX_SIMPLE) | |
| contours = imutils.grab_contours(contours) | |
| contours = sorted(contours, key=cv2.contourArea, reverse=True) | |
| image_area = image.shape[0] * image.shape[1] | |
| for contour in contours[:25]: | |
| peri = cv2.arcLength(contour, True) | |
| approx = cv2.approxPolyDP(contour, 0.02 * peri, True) | |
| area = cv2.contourArea(contour) | |
| area_percentage = (area / image_area) * 100 | |
| # Absolute minimum: Any rectangular shape with meaningful size | |
| # - Must be at least 3% of image (not tiny labels/logos) | |
| # - Must be at least 20,000 pixels (prevents tiny regions) | |
| if len(approx) >= 4 and area > 20000 and area_percentage > 3: | |
| x, y, w, h = cv2.boundingRect(contour) | |
| aspect_ratio = float(w) / h if h > 0 else 0 | |
| # Basic book-like shape | |
| # - Minimum width/height to avoid tiny regions | |
| if 0.3 < aspect_ratio < 1.8 and w > 100 and h > 100: | |
| return contour | |
| return None | |
| def _extract_book(self, image, contour): | |
| """Extract book region with perspective correction.""" | |
| peri = cv2.arcLength(contour, True) | |
| approx = cv2.approxPolyDP(contour, 0.02 * peri, True) | |
| if len(approx) == 4: | |
| # Apply perspective transform | |
| book_region = self._four_point_transform(image, approx.reshape(4, 2)) | |
| else: | |
| # Use bounding box | |
| x, y, w, h = cv2.boundingRect(contour) | |
| book_region = image[y:y+h, x:x+w] | |
| return book_region | |
| def _four_point_transform(self, image, pts): | |
| """Perspective transform helper.""" | |
| rect = self._order_points(pts) | |
| (tl, tr, br, bl) = rect | |
| widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) | |
| widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) | |
| maxWidth = max(int(widthA), int(widthB)) | |
| heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) | |
| heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) | |
| maxHeight = max(int(heightA), int(heightB)) | |
| dst = np.array([ | |
| [0, 0], | |
| [maxWidth - 1, 0], | |
| [maxWidth - 1, maxHeight - 1], | |
| [0, maxHeight - 1] | |
| ], dtype="float32") | |
| M = cv2.getPerspectiveTransform(rect, dst) | |
| warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight)) | |
| return warped | |
| def _order_points(self, pts): | |
| """Order points clockwise.""" | |
| rect = np.zeros((4, 2), dtype="float32") | |
| s = pts.sum(axis=1) | |
| rect[0] = pts[np.argmin(s)] | |
| rect[2] = pts[np.argmax(s)] | |
| diff = np.diff(pts, axis=1) | |
| rect[1] = pts[np.argmin(diff)] | |
| rect[3] = pts[np.argmax(diff)] | |
| return rect | |
| def _detect_text(self, book_image): | |
| """ | |
| Stage 2: Text detection and recognition using EasyOCR. | |
| ML CONCEPT: End-to-End Deep Learning | |
| ===================================== | |
| EasyOCR uses TWO neural networks: | |
| 1. CRAFT: Detects WHERE text is (bounding boxes) | |
| 2. Recognition Net: Reads WHAT text says (OCR) | |
| Both are pre-trained, no training needed! | |
| """ | |
| results = self.ocr_reader.readtext(book_image, detail=1) | |
| text_regions = [] | |
| for idx, (bbox, text, confidence) in enumerate(results): | |
| bbox_array = np.array(bbox, dtype=np.int32) | |
| x = int(np.min(bbox_array[:, 0])) | |
| y = int(np.min(bbox_array[:, 1])) | |
| w = int(np.max(bbox_array[:, 0]) - x) | |
| h = int(np.max(bbox_array[:, 1]) - y) | |
| text_regions.append({ | |
| 'id': idx + 1, | |
| 'text': text, | |
| 'bbox': {'x': x, 'y': y, 'width': w, 'height': h}, | |
| 'confidence': float(confidence), | |
| 'type': 'text' | |
| }) | |
| return text_regions | |
| def _classify_images(self, book_image, text_regions): | |
| """ | |
| Stage 3: Image classification using PyTorch. | |
| ML CONCEPT: Transfer Learning (Zero-shot) + Multi-Region Sampling | |
| ================================================================== | |
| Strategy: Sample multiple regions across the book cover | |
| - Center region (main illustration) | |
| - Top corners (logos, badges, icons) | |
| - Bottom corners (publisher logos, barcodes) | |
| IMPROVEMENT: No longer skips regions with text overlay! | |
| Books often have illustrations WITH text on them. | |
| """ | |
| height, width = book_image.shape[:2] | |
| # Define regions to sample (multiple locations) | |
| sample_regions = [ | |
| # Center - main illustration area | |
| { | |
| 'name': 'center', | |
| 'x1': max(0, width // 2 - 150), | |
| 'y1': max(0, height // 2 - 150), | |
| 'x2': min(width, width // 2 + 150), | |
| 'y2': min(height, height // 2 + 150) | |
| }, | |
| # Top-left corner | |
| { | |
| 'name': 'top_left', | |
| 'x1': 10, | |
| 'y1': 10, | |
| 'x2': min(width, 150), | |
| 'y2': min(height, 150) | |
| }, | |
| # Top-right corner | |
| { | |
| 'name': 'top_right', | |
| 'x1': max(0, width - 150), | |
| 'y1': 10, | |
| 'x2': width - 10, | |
| 'y2': min(height, 150) | |
| }, | |
| # Bottom-left corner | |
| { | |
| 'name': 'bottom_left', | |
| 'x1': 10, | |
| 'y1': max(0, height - 150), | |
| 'x2': min(width, 150), | |
| 'y2': height - 10 | |
| }, | |
| # Bottom-right corner | |
| { | |
| 'name': 'bottom_right', | |
| 'x1': max(0, width - 150), | |
| 'y1': max(0, height - 150), | |
| 'x2': width - 10, | |
| 'y2': height - 10 | |
| } | |
| ] | |
| image_regions = [] | |
| region_id = len(text_regions) + 1 | |
| for sample in sample_regions: | |
| x1, y1 = sample['x1'], sample['y1'] | |
| x2, y2 = sample['x2'], sample['y2'] | |
| # Skip if region is too small | |
| if x2 - x1 < 50 or y2 - y1 < 50: | |
| continue | |
| # Extract region | |
| region_crop = book_image[y1:y2, x1:x2] | |
| # Check if region has enough visual content (not just solid color) | |
| gray = cv2.cvtColor(region_crop, cv2.COLOR_BGR2GRAY) | |
| std_dev = np.std(gray) | |
| # Only classify if there's visual complexity (not blank/solid color) | |
| if std_dev > 15: # Threshold for visual complexity | |
| result = self._classify_single_image(region_crop) | |
| image_regions.append({ | |
| 'id': region_id, | |
| 'location': sample['name'], | |
| 'bbox': {'x': x1, 'y': y1, 'width': x2-x1, 'height': y2-y1}, | |
| 'pytorch_class': result['class_name'], | |
| 'confidence': result['confidence'], | |
| 'top_5': result['top_5'], | |
| 'type': 'image' | |
| }) | |
| region_id += 1 | |
| return image_regions | |
| def _classify_single_image(self, opencv_image): | |
| """ | |
| Classify single image region with PyTorch. | |
| ML CONCEPT: Inference Pipeline | |
| =============================== | |
| 1. Convert BGR → RGB → PIL | |
| 2. Resize, crop, normalize (preprocessing) | |
| 3. Convert to tensor, add batch dim | |
| 4. Forward pass through ResNet18 | |
| 5. Softmax to get probabilities | |
| 6. Return top predictions | |
| """ | |
| # Convert OpenCV (BGR) → PIL (RGB) | |
| rgb = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB) | |
| pil_image = Image.fromarray(rgb) | |
| # Preprocess | |
| tensor = self.transform(pil_image) | |
| tensor = tensor.unsqueeze(0) # Add batch dimension | |
| # Inference | |
| with torch.no_grad(): | |
| outputs = self.pytorch_model(tensor) | |
| probs = torch.nn.functional.softmax(outputs[0], dim=0) | |
| # Get top 5 | |
| top_probs, top_indices = torch.topk(probs, 5) | |
| top_class_id = top_indices[0].item() | |
| top_confidence = top_probs[0].item() | |
| # Get actual ImageNet class name | |
| if self.imagenet_classes and top_class_id < len(self.imagenet_classes): | |
| class_name = self.imagenet_classes[top_class_id] | |
| else: | |
| class_name = f"element_{top_class_id}" | |
| top_5 = [ | |
| { | |
| 'class_id': idx.item(), | |
| 'class_name': self.imagenet_classes[idx.item()] if self.imagenet_classes and idx.item() < len(self.imagenet_classes) else f"class_{idx.item()}", | |
| 'confidence': prob.item() | |
| } | |
| for idx, prob in zip(top_indices, top_probs) | |
| ] | |
| return { | |
| 'class_id': top_class_id, | |
| 'class_name': class_name, | |
| 'confidence': float(top_confidence), | |
| 'top_5': top_5 | |
| } | |