"""
Book Cover Analyzer - Web-Ready Version
Refactored V4 for Flask integration
"""
import cv2
import numpy as np
import easyocr
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import warnings
import imutils
import os
warnings.filterwarnings('ignore')
class BookCoverAnalyzer:
"""
Complete book cover analysis pipeline.
ML CONCEPT: Stateful Service
============================
This class is designed for web deployment:
- Initialize once (loads models into memory)
- Process many images (reuse loaded models)
    - Shareable across requests (models are used read-only at inference,
      though EasyOCR makes no formal thread-safety guarantee)
Benefits:
- Fast: Models loaded once, not per request
- Memory efficient: Shared model weights
- Production-ready: Error handling included
"""
def __init__(self, verbose=False):
"""
Initialize analyzer with all ML models.
ML CONCEPT: Model Loading Strategy
===================================
We load ALL models at startup (not lazy):
- EasyOCR: Text detection + recognition
- PyTorch ResNet18: Image classification
Why load at startup?
- Web server starts once
- First request doesn't have cold-start delay
- Consistent response times
"""
self.verbose = verbose
if self.verbose:
print("[INFO] Initializing Book Cover Analyzer...")
# Load EasyOCR
if self.verbose:
print("[INFO] Loading EasyOCR...")
self.ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
# Load PyTorch model
if self.verbose:
print("[INFO] Loading PyTorch ResNet18...")
self.pytorch_model = models.resnet18(weights='DEFAULT')
self.pytorch_model.eval()
# Create preprocessing pipeline
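        # The mean/std values below are the standard ImageNet training
        # statistics that torchvision's pretrained models expect.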
self.transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
# Load ImageNet class labels
        labels_path = os.path.join(os.path.dirname(__file__), 'imagenet_classes.txt')
try:
with open(labels_path, 'r') as f:
self.imagenet_classes = [line.strip() for line in f.readlines()]
except FileNotFoundError:
if self.verbose:
print("[WARNING] ImageNet classes file not found, using generic labels")
self.imagenet_classes = None
if self.verbose:
print("[SUCCESS] All models loaded!")
def analyze(self, image_path):
"""
Complete analysis pipeline.
Args:
image_path: Path to book cover image
        Returns:
            dict: {
                'success': bool,
                'book_extracted': bool,
                'text_regions': [...],
                'image_regions': [...],
                'summary': {...},
                'interpretation': {...},
                'error': str (if failed)
            }
        ML CONCEPT: Pipeline Architecture
        ==================================
        Stage 1: OpenCV (classical CV) - Find the book
        Stage 2: EasyOCR (deep learning) - Detect & read text
        Stage 3: PyTorch (deep learning) - Classify image regions
        Stage 4: Heuristics - Infer metadata & build an interpretation
        Each stage can fail independently;
        we return partial results plus an error status.
"""
try:
# Load image
image = cv2.imread(image_path)
if image is None:
return {
'success': False,
'error': 'Could not load image'
}
# Stage 1: Find and extract book
book_contour = self._find_book(image)
if book_contour is None:
# Fallback: If book detection fails, process whole image
# This handles complex scenes where edge detection struggles
if self.verbose:
print("[INFO] Book detection failed, processing entire image")
book_image = image
book_extracted = False
else:
# Successfully detected book boundary
book_image = self._extract_book(image, book_contour)
book_extracted = True
# Stage 2: Text detection and recognition
text_regions = self._detect_text(book_image)
# Stage 3: Image classification
image_regions = self._classify_images(book_image, text_regions)
# Stage 4: Generate human-readable interpretation
interpretation = self._generate_interpretation(text_regions, image_regions)
# Summary statistics
summary = {
'total_text_regions': len(text_regions),
'total_image_regions': len(image_regions),
'book_dimensions': f"{book_image.shape[1]}x{book_image.shape[0]}"
}
return {
'success': True,
'book_extracted': book_extracted,
'text_regions': text_regions,
'image_regions': image_regions,
'summary': summary,
'interpretation': interpretation
}
except Exception as e:
return {
'success': False,
'error': f'Analysis failed: {str(e)}'
}
def _generate_interpretation(self, text_regions, image_regions):
"""
Generate human-readable interpretation of analysis results.
ML CONCEPT: Post-Processing and Interpretation
===============================================
Raw ML outputs need human-readable summaries.
This method:
1. Combines all detected text into readable format
2. Describes what types of images were found
3. Provides context for non-technical users
This bridges the gap between ML predictions and user understanding.
"""
interpretation = {}
# Text interpretation with smart inference
if text_regions:
# Combine all text sorted by position (top to bottom)
sorted_texts = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x']))
all_text = [r['text'] for r in sorted_texts]
# High confidence text only (> 70%)
high_conf_text = [r['text'] for r in sorted_texts if r['confidence'] > 0.7]
interpretation['full_text'] = ' '.join(all_text)
interpretation['high_confidence_text'] = ' '.join(high_conf_text)
interpretation['word_count'] = len(all_text)
# Smart inference: Infer book title, author, publisher
interpretation.update(self._infer_book_metadata(text_regions))
else:
interpretation['full_text'] = "No text detected"
interpretation['high_confidence_text'] = ""
interpretation['word_count'] = 0
interpretation['inferred_title'] = None
interpretation['inferred_authors'] = []
interpretation['inferred_publisher'] = None
interpretation['other_text'] = []
# Image interpretation
if image_regions:
image_descriptions = []
for img in image_regions:
location = img.get('location', 'unknown')
confidence = img['confidence'] * 100
class_name = img['pytorch_class']
image_descriptions.append({
'location': location.replace('_', ' ').title(),
'classification': class_name,
'confidence': f"{confidence:.1f}%",
'description': f"Found visual element in {location.replace('_', ' ')} area (classified as {class_name} with {confidence:.1f}% confidence)"
})
interpretation['images'] = image_descriptions
interpretation['image_summary'] = f"Detected {len(image_regions)} visual elements/illustrations on the cover"
else:
interpretation['images'] = []
interpretation['image_summary'] = "No distinct image regions detected (cover may be text-only or require closer inspection)"
# Overall interpretation
if text_regions and image_regions:
interpretation['cover_type'] = "Mixed - Contains both text and visual elements"
elif text_regions and not image_regions:
interpretation['cover_type'] = "Text-heavy - Primarily text-based design"
elif image_regions and not text_regions:
interpretation['cover_type'] = "Visual-heavy - Primarily image-based design"
else:
interpretation['cover_type'] = "Unknown - Analysis incomplete"
return interpretation
def _infer_book_metadata(self, text_regions):
"""
Infer book title, author, and publisher from detected text.
ML CONCEPT: Heuristic-Based Inference
======================================
Uses simple rules to guess book metadata:
- Title: Largest text, usually at top
- Author: Often capitalized names, medium-large text
- Publisher: Smaller text, often at bottom
- Reviews/Quotes: Text with quotation marks
This is NOT ML - it's rule-based heuristics!
        For better accuracy, apply NER (Named Entity Recognition) to the extracted text.
"""
sorted_regions = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x']))
# Categorize by size and position
large_text = [r for r in text_regions if r['bbox']['height'] > 50] # Very large (title candidates)
medium_text = [r for r in text_regions if 30 < r['bbox']['height'] <= 50] # Medium (author/subtitle)
small_text = [r for r in text_regions if r['bbox']['height'] <= 30] # Small (publisher, quotes)
# Sort by vertical position
large_text.sort(key=lambda r: r['bbox']['y'])
medium_text.sort(key=lambda r: r['bbox']['y'])
small_text.sort(key=lambda r: r['bbox']['y'])
result = {}
# Infer Title: Largest text at top
if large_text:
            title_parts = [r['text'] for r in large_text[:2]]  # Two topmost large regions
result['inferred_title'] = ' '.join(title_parts)
else:
result['inferred_title'] = None
# Infer Authors: Look for capitalized names
authors = []
for region in medium_text + large_text:
text = region['text']
            # Fully uppercase text is often an author name on covers
if text.isupper() and len(text) > 3:
# Avoid common words like "THE", "OF"
if text not in ['THE', 'OF', 'AND', 'IN', 'A', 'AN']:
authors.append(text.title()) # Convert to title case
# Also check for name patterns (First Last)
for region in text_regions:
text = region['text']
# Pattern: Two or more capitalized words
words = text.split()
            if len(words) >= 2 and all(w[:1].isupper() for w in words):
if text not in authors and len(text) > 5:
authors.append(text)
        result['inferred_authors'] = list(dict.fromkeys(authors))[:3]  # Dedupe (order-preserving), max 3
# Infer Publisher: Small text at bottom
        avg_y = sum(reg['bbox']['y'] for reg in text_regions) / len(text_regions)
        bottom_small = [r for r in small_text if r['bbox']['y'] > avg_y]
publisher_candidates = []
for region in bottom_small:
text = region['text']
# Skip quotes, numbers, common words
if not any(char in text for char in ['"', "'", '«', '»']) and not text.isdigit():
if len(text) > 3:
publisher_candidates.append(text)
result['inferred_publisher'] = publisher_candidates[0] if publisher_candidates else None
# Collect other meaningful text (reviews, quotes, etc.)
other_text = []
for region in text_regions:
text = region['text']
            # Quote marks or review-style words suggest blurbs/reviews
            # (substring match, so the multi-word markers also work)
            if any(marker in text for marker in ['"', "'", '«', '»', 'Compelling', 'Fascinating']):
other_text.append(text)
result['other_text'] = other_text[:5] # Max 5 items
return result
def _find_book(self, image):
"""
Stage 1: Find book using edge detection with fallback.
ML CONCEPT: Classical Computer Vision with Robustness
======================================================
Uses hand-crafted algorithms (not ML):
- Canny edge detection with multiple thresholds
- Contour finding
- Shape analysis (aspect ratio, area)
- Fallback strategy for real-world photos
IMPROVEMENT: Added fallback for books in complex scenes
(e.g., book on desk with other objects)
"""
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Try multiple edge detection thresholds
edge_params = [
(50, 150), # Original
(30, 100), # More sensitive
(75, 200), # Less sensitive
]
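        # Try each threshold pair in turn: a strict pass first, then a
        # relaxed pass for cluttered real-world photos.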
for low_thresh, high_thresh in edge_params:
edged = cv2.Canny(blurred, low_thresh, high_thresh)
contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
# First pass: Strict criteria (ideal conditions)
for contour in contours[:15]:
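                # epsilon = 2% of the perimeter is a common approxPolyDP
                # heuristic: coarse enough to reduce a noisy book outline
                # to roughly four corners.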
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
image_area = image.shape[0] * image.shape[1]
area_percentage = (area / image_area) * 100
# Strict criteria: Clean book photos
if (area_percentage > 10 and 0.4 < aspect_ratio < 1.2
and len(approx) >= 4):
return contour
# Second pass: Relaxed criteria (real-world photos)
for contour in contours[:20]:
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
image_area = image.shape[0] * image.shape[1]
area_percentage = (area / image_area) * 100
# Relaxed criteria: Books in complex scenes
# - Lower area threshold (5% instead of 10%)
# - Wider aspect ratio range (0.3 to 1.5)
# - Require rectangular shape (4 corners)
if (area_percentage > 5 and 0.3 < aspect_ratio < 1.5
and len(approx) >= 4 and area > 5000):
return contour
# Fallback: Use largest rectangular contour with reasonable size
# This handles cases where book edges aren't perfect
edged = cv2.Canny(blurred, 30, 100)
contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
image_area = image.shape[0] * image.shape[1]
for contour in contours[:25]:
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
area_percentage = (area / image_area) * 100
# Absolute minimum: Any rectangular shape with meaningful size
# - Must be at least 3% of image (not tiny labels/logos)
# - Must be at least 20,000 pixels (prevents tiny regions)
if len(approx) >= 4 and area > 20000 and area_percentage > 3:
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
# Basic book-like shape
# - Minimum width/height to avoid tiny regions
if 0.3 < aspect_ratio < 1.8 and w > 100 and h > 100:
return contour
return None
def _extract_book(self, image, contour):
"""Extract book region with perspective correction."""
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
if len(approx) == 4:
# Apply perspective transform
book_region = self._four_point_transform(image, approx.reshape(4, 2))
else:
# Use bounding box
x, y, w, h = cv2.boundingRect(contour)
book_region = image[y:y+h, x:x+w]
return book_region
def _four_point_transform(self, image, pts):
"""Perspective transform helper."""
rect = self._order_points(pts)
(tl, tr, br, bl) = rect
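        # Output size = the longer of each pair of opposite edges, so the
        # warped book keeps its full extent without being squashed.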
widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
maxWidth = max(int(widthA), int(widthB))
heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
maxHeight = max(int(heightA), int(heightB))
dst = np.array([
[0, 0],
[maxWidth - 1, 0],
[maxWidth - 1, maxHeight - 1],
[0, maxHeight - 1]
], dtype="float32")
M = cv2.getPerspectiveTransform(rect, dst)
warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
return warped
def _order_points(self, pts):
"""Order points clockwise."""
rect = np.zeros((4, 2), dtype="float32")
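        # x + y is smallest at the top-left corner and largest at the
        # bottom-right; y - x is smallest at the top-right and largest
        # at the bottom-left.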
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def _detect_text(self, book_image):
"""
Stage 2: Text detection and recognition using EasyOCR.
ML CONCEPT: End-to-End Deep Learning
=====================================
EasyOCR uses TWO neural networks:
1. CRAFT: Detects WHERE text is (bounding boxes)
2. Recognition Net: Reads WHAT text says (OCR)
Both are pre-trained, no training needed!
"""
results = self.ocr_reader.readtext(book_image, detail=1)
text_regions = []
for idx, (bbox, text, confidence) in enumerate(results):
bbox_array = np.array(bbox, dtype=np.int32)
x = int(np.min(bbox_array[:, 0]))
y = int(np.min(bbox_array[:, 1]))
w = int(np.max(bbox_array[:, 0]) - x)
h = int(np.max(bbox_array[:, 1]) - y)
text_regions.append({
'id': idx + 1,
'text': text,
'bbox': {'x': x, 'y': y, 'width': w, 'height': h},
'confidence': float(confidence),
'type': 'text'
})
return text_regions
def _classify_images(self, book_image, text_regions):
"""
Stage 3: Image classification using PyTorch.
        ML CONCEPT: Pretrained-Model Reuse + Multi-Region Sampling
        ============================================================
        ImageNet weights are used as-is (zero-shot, no fine-tuning).
Strategy: Sample multiple regions across the book cover
- Center region (main illustration)
- Top corners (logos, badges, icons)
- Bottom corners (publisher logos, barcodes)
IMPROVEMENT: No longer skips regions with text overlay!
Books often have illustrations WITH text on them.
"""
height, width = book_image.shape[:2]
# Define regions to sample (multiple locations)
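        # Fixed pixel-size windows: ~300x300 at the center, ~140x140 in each
        # corner; undersized crops are filtered by the size check below.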
sample_regions = [
# Center - main illustration area
{
'name': 'center',
'x1': max(0, width // 2 - 150),
'y1': max(0, height // 2 - 150),
'x2': min(width, width // 2 + 150),
'y2': min(height, height // 2 + 150)
},
# Top-left corner
{
'name': 'top_left',
'x1': 10,
'y1': 10,
'x2': min(width, 150),
'y2': min(height, 150)
},
# Top-right corner
{
'name': 'top_right',
'x1': max(0, width - 150),
'y1': 10,
'x2': width - 10,
'y2': min(height, 150)
},
# Bottom-left corner
{
'name': 'bottom_left',
'x1': 10,
'y1': max(0, height - 150),
'x2': min(width, 150),
'y2': height - 10
},
# Bottom-right corner
{
'name': 'bottom_right',
'x1': max(0, width - 150),
'y1': max(0, height - 150),
'x2': width - 10,
'y2': height - 10
}
]
image_regions = []
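        # Continue id numbering after the text regions so ids stay unique
        # across both result lists.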
region_id = len(text_regions) + 1
for sample in sample_regions:
x1, y1 = sample['x1'], sample['y1']
x2, y2 = sample['x2'], sample['y2']
# Skip if region is too small
if x2 - x1 < 50 or y2 - y1 < 50:
continue
# Extract region
region_crop = book_image[y1:y2, x1:x2]
# Check if region has enough visual content (not just solid color)
gray = cv2.cvtColor(region_crop, cv2.COLOR_BGR2GRAY)
std_dev = np.std(gray)
# Only classify if there's visual complexity (not blank/solid color)
if std_dev > 15: # Threshold for visual complexity
result = self._classify_single_image(region_crop)
image_regions.append({
'id': region_id,
'location': sample['name'],
'bbox': {'x': x1, 'y': y1, 'width': x2-x1, 'height': y2-y1},
'pytorch_class': result['class_name'],
'confidence': result['confidence'],
'top_5': result['top_5'],
'type': 'image'
})
region_id += 1
return image_regions
def _classify_single_image(self, opencv_image):
"""
Classify single image region with PyTorch.
ML CONCEPT: Inference Pipeline
===============================
1. Convert BGR → RGB → PIL
2. Resize, crop, normalize (preprocessing)
3. Convert to tensor, add batch dim
4. Forward pass through ResNet18
5. Softmax to get probabilities
6. Return top predictions
"""
# Convert OpenCV (BGR) → PIL (RGB)
rgb = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb)
# Preprocess
tensor = self.transform(pil_image)
tensor = tensor.unsqueeze(0) # Add batch dimension
# Inference
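        # no_grad() skips autograd bookkeeping; the model returns raw logits
        # over the 1000 ImageNet classes (shape [1, 1000]).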
with torch.no_grad():
outputs = self.pytorch_model(tensor)
probs = torch.nn.functional.softmax(outputs[0], dim=0)
# Get top 5
top_probs, top_indices = torch.topk(probs, 5)
top_class_id = top_indices[0].item()
top_confidence = top_probs[0].item()
# Get actual ImageNet class name
if self.imagenet_classes and top_class_id < len(self.imagenet_classes):
class_name = self.imagenet_classes[top_class_id]
else:
class_name = f"element_{top_class_id}"
        top_5 = []
        for idx, prob in zip(top_indices, top_probs):
            cid = idx.item()
            if self.imagenet_classes and cid < len(self.imagenet_classes):
                name = self.imagenet_classes[cid]
            else:
                name = f"class_{cid}"
            top_5.append({
                'class_id': cid,
                'class_name': name,
                'confidence': prob.item()
            })
return {
'class_id': top_class_id,
'class_name': class_name,
'confidence': float(top_confidence),
'top_5': top_5
}
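# Minimal usage sketch. The image path below is a placeholder; point it at a
# real cover photo before running.
if __name__ == '__main__':
    analyzer = BookCoverAnalyzer(verbose=True)
    result = analyzer.analyze('cover.jpg')  # hypothetical sample path
    if result['success']:
        print(result['summary'])
        print(result['interpretation'].get('inferred_title'))
    else:
        print('Error:', result['error'])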