""" Book Cover Analyzer - Web-Ready Version Refactored V4 for Flask integration """ import cv2 import numpy as np import easyocr import torch import torchvision.models as models import torchvision.transforms as transforms from PIL import Image import warnings import imutils warnings.filterwarnings('ignore') class BookCoverAnalyzer: """ Complete book cover analysis pipeline. ML CONCEPT: Stateful Service ============================ This class is designed for web deployment: - Initialize once (loads models into memory) - Process many images (reuse loaded models) - Thread-safe design (can handle multiple requests) Benefits: - Fast: Models loaded once, not per request - Memory efficient: Shared model weights - Production-ready: Error handling included """ def __init__(self, verbose=False): """ Initialize analyzer with all ML models. ML CONCEPT: Model Loading Strategy =================================== We load ALL models at startup (not lazy): - EasyOCR: Text detection + recognition - PyTorch ResNet18: Image classification Why load at startup? - Web server starts once - First request doesn't have cold-start delay - Consistent response times """ self.verbose = verbose if self.verbose: print("[INFO] Initializing Book Cover Analyzer...") # Load EasyOCR if self.verbose: print("[INFO] Loading EasyOCR...") self.ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False) # Load PyTorch model if self.verbose: print("[INFO] Loading PyTorch ResNet18...") self.pytorch_model = models.resnet18(weights='DEFAULT') self.pytorch_model.eval() # Create preprocessing pipeline self.transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) ]) # Load ImageNet class labels import os labels_path = os.path.join(os.path.dirname(__file__), 'imagenet_classes.txt') try: with open(labels_path, 'r') as f: self.imagenet_classes = [line.strip() for line in f.readlines()] except FileNotFoundError: if self.verbose: print("[WARNING] ImageNet classes file not found, using generic labels") self.imagenet_classes = None if self.verbose: print("[SUCCESS] All models loaded!") def analyze(self, image_path): """ Complete analysis pipeline. Args: image_path: Path to book cover image Returns: dict: { 'success': bool, 'book_extracted': bool, 'text_regions': [...], 'image_regions': [...], 'summary': {...}, 'error': str (if failed) } ML CONCEPT: Pipeline Architecture ================================== Stage 1: OpenCV (Classical CV) - Find book Stage 2: EasyOCR (DL) - Detect & read text Stage 3: PyTorch (DL) - Classify images Each stage can fail independently. We return partial results + error status. """ try: # Load image image = cv2.imread(image_path) if image is None: return { 'success': False, 'error': 'Could not load image' } # Stage 1: Find and extract book book_contour = self._find_book(image) if book_contour is None: # Fallback: If book detection fails, process whole image # This handles complex scenes where edge detection struggles if self.verbose: print("[INFO] Book detection failed, processing entire image") book_image = image book_extracted = False else: # Successfully detected book boundary book_image = self._extract_book(image, book_contour) book_extracted = True # Stage 2: Text detection and recognition text_regions = self._detect_text(book_image) # Stage 3: Image classification image_regions = self._classify_images(book_image, text_regions) # Stage 4: Generate human-readable interpretation interpretation = self._generate_interpretation(text_regions, image_regions) # Summary statistics summary = { 'total_text_regions': len(text_regions), 'total_image_regions': len(image_regions), 'book_dimensions': f"{book_image.shape[1]}x{book_image.shape[0]}" } return { 'success': True, 'book_extracted': book_extracted, 'text_regions': text_regions, 'image_regions': image_regions, 'summary': summary, 'interpretation': interpretation } except Exception as e: return { 'success': False, 'error': f'Analysis failed: {str(e)}' } def _generate_interpretation(self, text_regions, image_regions): """ Generate human-readable interpretation of analysis results. ML CONCEPT: Post-Processing and Interpretation =============================================== Raw ML outputs need human-readable summaries. This method: 1. Combines all detected text into readable format 2. Describes what types of images were found 3. Provides context for non-technical users This bridges the gap between ML predictions and user understanding. """ interpretation = {} # Text interpretation with smart inference if text_regions: # Combine all text sorted by position (top to bottom) sorted_texts = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x'])) all_text = [r['text'] for r in sorted_texts] # High confidence text only (> 70%) high_conf_text = [r['text'] for r in sorted_texts if r['confidence'] > 0.7] interpretation['full_text'] = ' '.join(all_text) interpretation['high_confidence_text'] = ' '.join(high_conf_text) interpretation['word_count'] = len(all_text) # Smart inference: Infer book title, author, publisher interpretation.update(self._infer_book_metadata(text_regions)) else: interpretation['full_text'] = "No text detected" interpretation['high_confidence_text'] = "" interpretation['word_count'] = 0 interpretation['inferred_title'] = None interpretation['inferred_authors'] = [] interpretation['inferred_publisher'] = None interpretation['other_text'] = [] # Image interpretation if image_regions: image_descriptions = [] for img in image_regions: location = img.get('location', 'unknown') confidence = img['confidence'] * 100 class_name = img['pytorch_class'] image_descriptions.append({ 'location': location.replace('_', ' ').title(), 'classification': class_name, 'confidence': f"{confidence:.1f}%", 'description': f"Found visual element in {location.replace('_', ' ')} area (classified as {class_name} with {confidence:.1f}% confidence)" }) interpretation['images'] = image_descriptions interpretation['image_summary'] = f"Detected {len(image_regions)} visual elements/illustrations on the cover" else: interpretation['images'] = [] interpretation['image_summary'] = "No distinct image regions detected (cover may be text-only or require closer inspection)" # Overall interpretation if text_regions and image_regions: interpretation['cover_type'] = "Mixed - Contains both text and visual elements" elif text_regions and not image_regions: interpretation['cover_type'] = "Text-heavy - Primarily text-based design" elif image_regions and not text_regions: interpretation['cover_type'] = "Visual-heavy - Primarily image-based design" else: interpretation['cover_type'] = "Unknown - Analysis incomplete" return interpretation def _infer_book_metadata(self, text_regions): """ Infer book title, author, and publisher from detected text. ML CONCEPT: Heuristic-Based Inference ====================================== Uses simple rules to guess book metadata: - Title: Largest text, usually at top - Author: Often capitalized names, medium-large text - Publisher: Smaller text, often at bottom - Reviews/Quotes: Text with quotation marks This is NOT ML - it's rule-based heuristics! For better accuracy, would use NER (Named Entity Recognition). """ import re sorted_regions = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x'])) # Categorize by size and position large_text = [r for r in text_regions if r['bbox']['height'] > 50] # Very large (title candidates) medium_text = [r for r in text_regions if 30 < r['bbox']['height'] <= 50] # Medium (author/subtitle) small_text = [r for r in text_regions if r['bbox']['height'] <= 30] # Small (publisher, quotes) # Sort by vertical position large_text.sort(key=lambda r: r['bbox']['y']) medium_text.sort(key=lambda r: r['bbox']['y']) small_text.sort(key=lambda r: r['bbox']['y']) result = {} # Infer Title: Largest text at top if large_text: title_parts = [r['text'] for r in large_text[:2]] # Top 2 largest result['inferred_title'] = ' '.join(title_parts) else: result['inferred_title'] = None # Infer Authors: Look for capitalized names authors = [] for region in medium_text + large_text: text = region['text'] # Check if mostly uppercase (likely author name) if text.isupper() and len(text) > 3: # Avoid common words like "THE", "OF" if text not in ['THE', 'OF', 'AND', 'IN', 'A', 'AN']: authors.append(text.title()) # Convert to title case # Also check for name patterns (First Last) for region in text_regions: text = region['text'] # Pattern: Two or more capitalized words words = text.split() if len(words) >= 2 and all(w[0].isupper() if w else False for w in words): if text not in authors and len(text) > 5: authors.append(text) result['inferred_authors'] = list(set(authors))[:3] # Dedupe, max 3 # Infer Publisher: Small text at bottom bottom_small = [r for r in small_text if r['bbox']['y'] > sum([reg['bbox']['y'] for reg in text_regions]) / len(text_regions)] publisher_candidates = [] for region in bottom_small: text = region['text'] # Skip quotes, numbers, common words if not any(char in text for char in ['"', "'", '«', '»']) and not text.isdigit(): if len(text) > 3: publisher_candidates.append(text) result['inferred_publisher'] = publisher_candidates[0] if publisher_candidates else None # Collect other meaningful text (reviews, quotes, etc.) other_text = [] for region in text_regions: text = region['text'] # Text with quotes (likely reviews) if any(char in text for char in ['"', "'", '«', '»', 'Compelling', 'Fascinating']): other_text.append(text) result['other_text'] = other_text[:5] # Max 5 items return result def _find_book(self, image): """ Stage 1: Find book using edge detection with fallback. ML CONCEPT: Classical Computer Vision with Robustness ====================================================== Uses hand-crafted algorithms (not ML): - Canny edge detection with multiple thresholds - Contour finding - Shape analysis (aspect ratio, area) - Fallback strategy for real-world photos IMPROVEMENT: Added fallback for books in complex scenes (e.g., book on desk with other objects) """ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(gray, (5, 5), 0) # Try multiple edge detection thresholds edge_params = [ (50, 150), # Original (30, 100), # More sensitive (75, 200), # Less sensitive ] for low_thresh, high_thresh in edge_params: edged = cv2.Canny(blurred, low_thresh, high_thresh) contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = imutils.grab_contours(contours) contours = sorted(contours, key=cv2.contourArea, reverse=True) # First pass: Strict criteria (ideal conditions) for contour in contours[:15]: peri = cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, 0.02 * peri, True) area = cv2.contourArea(contour) x, y, w, h = cv2.boundingRect(contour) aspect_ratio = float(w) / h if h > 0 else 0 image_area = image.shape[0] * image.shape[1] area_percentage = (area / image_area) * 100 # Strict criteria: Clean book photos if (area_percentage > 10 and 0.4 < aspect_ratio < 1.2 and len(approx) >= 4): return contour # Second pass: Relaxed criteria (real-world photos) for contour in contours[:20]: peri = cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, 0.02 * peri, True) area = cv2.contourArea(contour) x, y, w, h = cv2.boundingRect(contour) aspect_ratio = float(w) / h if h > 0 else 0 image_area = image.shape[0] * image.shape[1] area_percentage = (area / image_area) * 100 # Relaxed criteria: Books in complex scenes # - Lower area threshold (5% instead of 10%) # - Wider aspect ratio range (0.3 to 1.5) # - Require rectangular shape (4 corners) if (area_percentage > 5 and 0.3 < aspect_ratio < 1.5 and len(approx) >= 4 and area > 5000): return contour # Fallback: Use largest rectangular contour with reasonable size # This handles cases where book edges aren't perfect edged = cv2.Canny(blurred, 30, 100) contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = imutils.grab_contours(contours) contours = sorted(contours, key=cv2.contourArea, reverse=True) image_area = image.shape[0] * image.shape[1] for contour in contours[:25]: peri = cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, 0.02 * peri, True) area = cv2.contourArea(contour) area_percentage = (area / image_area) * 100 # Absolute minimum: Any rectangular shape with meaningful size # - Must be at least 3% of image (not tiny labels/logos) # - Must be at least 20,000 pixels (prevents tiny regions) if len(approx) >= 4 and area > 20000 and area_percentage > 3: x, y, w, h = cv2.boundingRect(contour) aspect_ratio = float(w) / h if h > 0 else 0 # Basic book-like shape # - Minimum width/height to avoid tiny regions if 0.3 < aspect_ratio < 1.8 and w > 100 and h > 100: return contour return None def _extract_book(self, image, contour): """Extract book region with perspective correction.""" peri = cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, 0.02 * peri, True) if len(approx) == 4: # Apply perspective transform book_region = self._four_point_transform(image, approx.reshape(4, 2)) else: # Use bounding box x, y, w, h = cv2.boundingRect(contour) book_region = image[y:y+h, x:x+w] return book_region def _four_point_transform(self, image, pts): """Perspective transform helper.""" rect = self._order_points(pts) (tl, tr, br, bl) = rect widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) maxWidth = max(int(widthA), int(widthB)) heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) maxHeight = max(int(heightA), int(heightB)) dst = np.array([ [0, 0], [maxWidth - 1, 0], [maxWidth - 1, maxHeight - 1], [0, maxHeight - 1] ], dtype="float32") M = cv2.getPerspectiveTransform(rect, dst) warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight)) return warped def _order_points(self, pts): """Order points clockwise.""" rect = np.zeros((4, 2), dtype="float32") s = pts.sum(axis=1) rect[0] = pts[np.argmin(s)] rect[2] = pts[np.argmax(s)] diff = np.diff(pts, axis=1) rect[1] = pts[np.argmin(diff)] rect[3] = pts[np.argmax(diff)] return rect def _detect_text(self, book_image): """ Stage 2: Text detection and recognition using EasyOCR. ML CONCEPT: End-to-End Deep Learning ===================================== EasyOCR uses TWO neural networks: 1. CRAFT: Detects WHERE text is (bounding boxes) 2. Recognition Net: Reads WHAT text says (OCR) Both are pre-trained, no training needed! """ results = self.ocr_reader.readtext(book_image, detail=1) text_regions = [] for idx, (bbox, text, confidence) in enumerate(results): bbox_array = np.array(bbox, dtype=np.int32) x = int(np.min(bbox_array[:, 0])) y = int(np.min(bbox_array[:, 1])) w = int(np.max(bbox_array[:, 0]) - x) h = int(np.max(bbox_array[:, 1]) - y) text_regions.append({ 'id': idx + 1, 'text': text, 'bbox': {'x': x, 'y': y, 'width': w, 'height': h}, 'confidence': float(confidence), 'type': 'text' }) return text_regions def _classify_images(self, book_image, text_regions): """ Stage 3: Image classification using PyTorch. ML CONCEPT: Transfer Learning (Zero-shot) + Multi-Region Sampling ================================================================== Strategy: Sample multiple regions across the book cover - Center region (main illustration) - Top corners (logos, badges, icons) - Bottom corners (publisher logos, barcodes) IMPROVEMENT: No longer skips regions with text overlay! Books often have illustrations WITH text on them. """ height, width = book_image.shape[:2] # Define regions to sample (multiple locations) sample_regions = [ # Center - main illustration area { 'name': 'center', 'x1': max(0, width // 2 - 150), 'y1': max(0, height // 2 - 150), 'x2': min(width, width // 2 + 150), 'y2': min(height, height // 2 + 150) }, # Top-left corner { 'name': 'top_left', 'x1': 10, 'y1': 10, 'x2': min(width, 150), 'y2': min(height, 150) }, # Top-right corner { 'name': 'top_right', 'x1': max(0, width - 150), 'y1': 10, 'x2': width - 10, 'y2': min(height, 150) }, # Bottom-left corner { 'name': 'bottom_left', 'x1': 10, 'y1': max(0, height - 150), 'x2': min(width, 150), 'y2': height - 10 }, # Bottom-right corner { 'name': 'bottom_right', 'x1': max(0, width - 150), 'y1': max(0, height - 150), 'x2': width - 10, 'y2': height - 10 } ] image_regions = [] region_id = len(text_regions) + 1 for sample in sample_regions: x1, y1 = sample['x1'], sample['y1'] x2, y2 = sample['x2'], sample['y2'] # Skip if region is too small if x2 - x1 < 50 or y2 - y1 < 50: continue # Extract region region_crop = book_image[y1:y2, x1:x2] # Check if region has enough visual content (not just solid color) gray = cv2.cvtColor(region_crop, cv2.COLOR_BGR2GRAY) std_dev = np.std(gray) # Only classify if there's visual complexity (not blank/solid color) if std_dev > 15: # Threshold for visual complexity result = self._classify_single_image(region_crop) image_regions.append({ 'id': region_id, 'location': sample['name'], 'bbox': {'x': x1, 'y': y1, 'width': x2-x1, 'height': y2-y1}, 'pytorch_class': result['class_name'], 'confidence': result['confidence'], 'top_5': result['top_5'], 'type': 'image' }) region_id += 1 return image_regions def _classify_single_image(self, opencv_image): """ Classify single image region with PyTorch. ML CONCEPT: Inference Pipeline =============================== 1. Convert BGR → RGB → PIL 2. Resize, crop, normalize (preprocessing) 3. Convert to tensor, add batch dim 4. Forward pass through ResNet18 5. Softmax to get probabilities 6. Return top predictions """ # Convert OpenCV (BGR) → PIL (RGB) rgb = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB) pil_image = Image.fromarray(rgb) # Preprocess tensor = self.transform(pil_image) tensor = tensor.unsqueeze(0) # Add batch dimension # Inference with torch.no_grad(): outputs = self.pytorch_model(tensor) probs = torch.nn.functional.softmax(outputs[0], dim=0) # Get top 5 top_probs, top_indices = torch.topk(probs, 5) top_class_id = top_indices[0].item() top_confidence = top_probs[0].item() # Get actual ImageNet class name if self.imagenet_classes and top_class_id < len(self.imagenet_classes): class_name = self.imagenet_classes[top_class_id] else: class_name = f"element_{top_class_id}" top_5 = [ { 'class_id': idx.item(), 'class_name': self.imagenet_classes[idx.item()] if self.imagenet_classes and idx.item() < len(self.imagenet_classes) else f"class_{idx.item()}", 'confidence': prob.item() } for idx, prob in zip(top_indices, top_probs) ] return { 'class_id': top_class_id, 'class_name': class_name, 'confidence': float(top_confidence), 'top_5': top_5 }