"""
Book Cover Analyzer - Web-Ready Version
Refactored V4 for Flask integration
"""
import cv2
import numpy as np
import easyocr
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import warnings
import imutils
import os
warnings.filterwarnings('ignore')
class BookCoverAnalyzer:
"""
Complete book cover analysis pipeline.
ML CONCEPT: Stateful Service
============================
This class is designed for web deployment:
- Initialize once (loads models into memory)
- Process many images (reuse loaded models)
    - Shareable across requests (models are used read-only at inference,
      though EasyOCR makes no formal thread-safety guarantee)
Benefits:
- Fast: Models loaded once, not per request
- Memory efficient: Shared model weights
- Production-ready: Error handling included
"""
def __init__(self, verbose=False):
"""
Initialize analyzer with all ML models.
ML CONCEPT: Model Loading Strategy
===================================
We load ALL models at startup (not lazy):
- EasyOCR: Text detection + recognition
- PyTorch ResNet18: Image classification
Why load at startup?
- Web server starts once
- First request doesn't have cold-start delay
- Consistent response times
"""
self.verbose = verbose
if self.verbose:
print("[INFO] Initializing Book Cover Analyzer...")
# Load EasyOCR
if self.verbose:
print("[INFO] Loading EasyOCR...")
self.ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
# Load PyTorch model
if self.verbose:
print("[INFO] Loading PyTorch ResNet18...")
self.pytorch_model = models.resnet18(weights='DEFAULT')
self.pytorch_model.eval()
# Create preprocessing pipeline
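        # The mean/std values below are the standard ImageNet training
        # statistics that torchvision's pretrained models expect.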
self.transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
# Load ImageNet class labels
        labels_path = os.path.join(os.path.dirname(__file__), 'imagenet_classes.txt')
try:
with open(labels_path, 'r') as f:
self.imagenet_classes = [line.strip() for line in f.readlines()]
except FileNotFoundError:
if self.verbose:
print("[WARNING] ImageNet classes file not found, using generic labels")
self.imagenet_classes = None
if self.verbose:
print("[SUCCESS] All models loaded!")
def analyze(self, image_path):
"""
Complete analysis pipeline.
Args:
image_path: Path to book cover image
        Returns:
            dict: {
                'success': bool,
                'book_extracted': bool,
                'text_regions': [...],
                'image_regions': [...],
                'summary': {...},
                'interpretation': {...},
                'error': str (if failed)
            }
        ML CONCEPT: Pipeline Architecture
        ==================================
        Stage 1: OpenCV (classical CV) - Find the book
        Stage 2: EasyOCR (deep learning) - Detect & read text
        Stage 3: PyTorch (deep learning) - Classify image regions
        Stage 4: Heuristics - Infer metadata & build an interpretation
        Each stage can fail independently;
        we return partial results plus an error status.
"""
try:
# Load image
image = cv2.imread(image_path)
if image is None:
return {
'success': False,
'error': 'Could not load image'
}
# Stage 1: Find and extract book
book_contour = self._find_book(image)
if book_contour is None:
# Fallback: If book detection fails, process whole image
# This handles complex scenes where edge detection struggles
if self.verbose:
print("[INFO] Book detection failed, processing entire image")
book_image = image
book_extracted = False
else:
# Successfully detected book boundary
book_image = self._extract_book(image, book_contour)
book_extracted = True
# Stage 2: Text detection and recognition
text_regions = self._detect_text(book_image)
# Stage 3: Image classification
image_regions = self._classify_images(book_image, text_regions)
# Stage 4: Generate human-readable interpretation
interpretation = self._generate_interpretation(text_regions, image_regions)
# Summary statistics
summary = {
'total_text_regions': len(text_regions),
'total_image_regions': len(image_regions),
'book_dimensions': f"{book_image.shape[1]}x{book_image.shape[0]}"
}
return {
'success': True,
'book_extracted': book_extracted,
'text_regions': text_regions,
'image_regions': image_regions,
'summary': summary,
'interpretation': interpretation
}
except Exception as e:
return {
'success': False,
'error': f'Analysis failed: {str(e)}'
}
def _generate_interpretation(self, text_regions, image_regions):
"""
Generate human-readable interpretation of analysis results.
ML CONCEPT: Post-Processing and Interpretation
===============================================
Raw ML outputs need human-readable summaries.
This method:
1. Combines all detected text into readable format
2. Describes what types of images were found
3. Provides context for non-technical users
This bridges the gap between ML predictions and user understanding.
"""
interpretation = {}
# Text interpretation with smart inference
if text_regions:
# Combine all text sorted by position (top to bottom)
sorted_texts = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x']))
all_text = [r['text'] for r in sorted_texts]
# High confidence text only (> 70%)
high_conf_text = [r['text'] for r in sorted_texts if r['confidence'] > 0.7]
interpretation['full_text'] = ' '.join(all_text)
interpretation['high_confidence_text'] = ' '.join(high_conf_text)
interpretation['word_count'] = len(all_text)
# Smart inference: Infer book title, author, publisher
interpretation.update(self._infer_book_metadata(text_regions))
else:
interpretation['full_text'] = "No text detected"
interpretation['high_confidence_text'] = ""
interpretation['word_count'] = 0
interpretation['inferred_title'] = None
interpretation['inferred_authors'] = []
interpretation['inferred_publisher'] = None
interpretation['other_text'] = []
# Image interpretation
if image_regions:
image_descriptions = []
for img in image_regions:
location = img.get('location', 'unknown')
confidence = img['confidence'] * 100
class_name = img['pytorch_class']
image_descriptions.append({
'location': location.replace('_', ' ').title(),
'classification': class_name,
'confidence': f"{confidence:.1f}%",
'description': f"Found visual element in {location.replace('_', ' ')} area (classified as {class_name} with {confidence:.1f}% confidence)"
})
interpretation['images'] = image_descriptions
interpretation['image_summary'] = f"Detected {len(image_regions)} visual elements/illustrations on the cover"
else:
interpretation['images'] = []
interpretation['image_summary'] = "No distinct image regions detected (cover may be text-only or require closer inspection)"
# Overall interpretation
if text_regions and image_regions:
interpretation['cover_type'] = "Mixed - Contains both text and visual elements"
elif text_regions and not image_regions:
interpretation['cover_type'] = "Text-heavy - Primarily text-based design"
elif image_regions and not text_regions:
interpretation['cover_type'] = "Visual-heavy - Primarily image-based design"
else:
interpretation['cover_type'] = "Unknown - Analysis incomplete"
return interpretation
def _infer_book_metadata(self, text_regions):
"""
Infer book title, author, and publisher from detected text.
ML CONCEPT: Heuristic-Based Inference
======================================
Uses simple rules to guess book metadata:
- Title: Largest text, usually at top
- Author: Often capitalized names, medium-large text
- Publisher: Smaller text, often at bottom
- Reviews/Quotes: Text with quotation marks
This is NOT ML - it's rule-based heuristics!
        For better accuracy, apply NER (Named Entity Recognition) to the extracted text.
"""
sorted_regions = sorted(text_regions, key=lambda r: (r['bbox']['y'], r['bbox']['x']))
# Categorize by size and position
large_text = [r for r in text_regions if r['bbox']['height'] > 50] # Very large (title candidates)
medium_text = [r for r in text_regions if 30 < r['bbox']['height'] <= 50] # Medium (author/subtitle)
small_text = [r for r in text_regions if r['bbox']['height'] <= 30] # Small (publisher, quotes)
# Sort by vertical position
large_text.sort(key=lambda r: r['bbox']['y'])
medium_text.sort(key=lambda r: r['bbox']['y'])
small_text.sort(key=lambda r: r['bbox']['y'])
result = {}
# Infer Title: Largest text at top
if large_text:
            title_parts = [r['text'] for r in large_text[:2]]  # Two topmost large regions
result['inferred_title'] = ' '.join(title_parts)
else:
result['inferred_title'] = None
# Infer Authors: Look for capitalized names
authors = []
for region in medium_text + large_text:
text = region['text']
            # Fully uppercase text is often an author name on covers
if text.isupper() and len(text) > 3:
# Avoid common words like "THE", "OF"
if text not in ['THE', 'OF', 'AND', 'IN', 'A', 'AN']:
authors.append(text.title()) # Convert to title case
# Also check for name patterns (First Last)
for region in text_regions:
text = region['text']
# Pattern: Two or more capitalized words
words = text.split()
            if len(words) >= 2 and all(w[:1].isupper() for w in words):
if text not in authors and len(text) > 5:
authors.append(text)
        result['inferred_authors'] = list(dict.fromkeys(authors))[:3]  # Dedupe (order-preserving), max 3
# Infer Publisher: Small text at bottom
        avg_y = sum(reg['bbox']['y'] for reg in text_regions) / len(text_regions)
        bottom_small = [r for r in small_text if r['bbox']['y'] > avg_y]
publisher_candidates = []
for region in bottom_small:
text = region['text']
# Skip quotes, numbers, common words
if not any(char in text for char in ['"', "'", '«', '»']) and not text.isdigit():
if len(text) > 3:
publisher_candidates.append(text)
result['inferred_publisher'] = publisher_candidates[0] if publisher_candidates else None
# Collect other meaningful text (reviews, quotes, etc.)
other_text = []
for region in text_regions:
text = region['text']
            # Quote marks or review-style words suggest blurbs/reviews
            # (substring match, so the multi-word markers also work)
            if any(marker in text for marker in ['"', "'", '«', '»', 'Compelling', 'Fascinating']):
other_text.append(text)
result['other_text'] = other_text[:5] # Max 5 items
return result
def _find_book(self, image):
"""
Stage 1: Find book using edge detection with fallback.
ML CONCEPT: Classical Computer Vision with Robustness
======================================================
Uses hand-crafted algorithms (not ML):
- Canny edge detection with multiple thresholds
- Contour finding
- Shape analysis (aspect ratio, area)
- Fallback strategy for real-world photos
IMPROVEMENT: Added fallback for books in complex scenes
(e.g., book on desk with other objects)
"""
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Try multiple edge detection thresholds
edge_params = [
(50, 150), # Original
(30, 100), # More sensitive
(75, 200), # Less sensitive
]
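        # Try each threshold pair in turn: a strict pass first, then a
        # relaxed pass for cluttered real-world photos.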
for low_thresh, high_thresh in edge_params:
edged = cv2.Canny(blurred, low_thresh, high_thresh)
contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
# First pass: Strict criteria (ideal conditions)
for contour in contours[:15]:
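                # epsilon = 2% of the perimeter is a common approxPolyDP
                # heuristic: coarse enough to reduce a noisy book outline
                # to roughly four corners.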
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
image_area = image.shape[0] * image.shape[1]
area_percentage = (area / image_area) * 100
# Strict criteria: Clean book photos
if (area_percentage > 10 and 0.4 < aspect_ratio < 1.2
and len(approx) >= 4):
return contour
# Second pass: Relaxed criteria (real-world photos)
for contour in contours[:20]:
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
image_area = image.shape[0] * image.shape[1]
area_percentage = (area / image_area) * 100
# Relaxed criteria: Books in complex scenes
# - Lower area threshold (5% instead of 10%)
# - Wider aspect ratio range (0.3 to 1.5)
# - Require rectangular shape (4 corners)
if (area_percentage > 5 and 0.3 < aspect_ratio < 1.5
and len(approx) >= 4 and area > 5000):
return contour
# Fallback: Use largest rectangular contour with reasonable size
# This handles cases where book edges aren't perfect
edged = cv2.Canny(blurred, 30, 100)
contours = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
contours = imutils.grab_contours(contours)
contours = sorted(contours, key=cv2.contourArea, reverse=True)
image_area = image.shape[0] * image.shape[1]
for contour in contours[:25]:
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
area = cv2.contourArea(contour)
area_percentage = (area / image_area) * 100
# Absolute minimum: Any rectangular shape with meaningful size
# - Must be at least 3% of image (not tiny labels/logos)
# - Must be at least 20,000 pixels (prevents tiny regions)
if len(approx) >= 4 and area > 20000 and area_percentage > 3:
x, y, w, h = cv2.boundingRect(contour)
aspect_ratio = float(w) / h if h > 0 else 0
# Basic book-like shape
# - Minimum width/height to avoid tiny regions
if 0.3 < aspect_ratio < 1.8 and w > 100 and h > 100:
return contour
return None
def _extract_book(self, image, contour):
"""Extract book region with perspective correction."""
peri = cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
if len(approx) == 4:
# Apply perspective transform
book_region = self._four_point_transform(image, approx.reshape(4, 2))
else:
# Use bounding box
x, y, w, h = cv2.boundingRect(contour)
book_region = image[y:y+h, x:x+w]
return book_region
def _four_point_transform(self, image, pts):
"""Perspective transform helper."""
rect = self._order_points(pts)
(tl, tr, br, bl) = rect
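        # Output size = the longer of each pair of opposite edges, so the
        # warped book keeps its full extent without being squashed.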
widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
maxWidth = max(int(widthA), int(widthB))
heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
maxHeight = max(int(heightA), int(heightB))
dst = np.array([
[0, 0],
[maxWidth - 1, 0],
[maxWidth - 1, maxHeight - 1],
[0, maxHeight - 1]
], dtype="float32")
M = cv2.getPerspectiveTransform(rect, dst)
warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
return warped
def _order_points(self, pts):
"""Order points clockwise."""
rect = np.zeros((4, 2), dtype="float32")
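        # x + y is smallest at the top-left corner and largest at the
        # bottom-right; y - x is smallest at the top-right and largest
        # at the bottom-left.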
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def _detect_text(self, book_image):
"""
Stage 2: Text detection and recognition using EasyOCR.
ML CONCEPT: End-to-End Deep Learning
=====================================
EasyOCR uses TWO neural networks:
1. CRAFT: Detects WHERE text is (bounding boxes)
2. Recognition Net: Reads WHAT text says (OCR)
Both are pre-trained, no training needed!
"""
results = self.ocr_reader.readtext(book_image, detail=1)
text_regions = []
for idx, (bbox, text, confidence) in enumerate(results):
bbox_array = np.array(bbox, dtype=np.int32)
x = int(np.min(bbox_array[:, 0]))
y = int(np.min(bbox_array[:, 1]))
w = int(np.max(bbox_array[:, 0]) - x)
h = int(np.max(bbox_array[:, 1]) - y)
text_regions.append({
'id': idx + 1,
'text': text,
'bbox': {'x': x, 'y': y, 'width': w, 'height': h},
'confidence': float(confidence),
'type': 'text'
})
return text_regions
def _classify_images(self, book_image, text_regions):
"""
Stage 3: Image classification using PyTorch.
        ML CONCEPT: Pretrained-Model Reuse + Multi-Region Sampling
        ============================================================
        ImageNet weights are used as-is (zero-shot, no fine-tuning).
Strategy: Sample multiple regions across the book cover
- Center region (main illustration)
- Top corners (logos, badges, icons)
- Bottom corners (publisher logos, barcodes)
IMPROVEMENT: No longer skips regions with text overlay!
Books often have illustrations WITH text on them.
"""
height, width = book_image.shape[:2]
# Define regions to sample (multiple locations)
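        # Fixed pixel-size windows: ~300x300 at the center, ~140x140 in each
        # corner; undersized crops are filtered by the size check below.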
sample_regions = [
# Center - main illustration area
{
'name': 'center',
'x1': max(0, width // 2 - 150),
'y1': max(0, height // 2 - 150),
'x2': min(width, width // 2 + 150),
'y2': min(height, height // 2 + 150)
},
# Top-left corner
{
'name': 'top_left',
'x1': 10,
'y1': 10,
'x2': min(width, 150),
'y2': min(height, 150)
},
# Top-right corner
{
'name': 'top_right',
'x1': max(0, width - 150),
'y1': 10,
'x2': width - 10,
'y2': min(height, 150)
},
# Bottom-left corner
{
'name': 'bottom_left',
'x1': 10,
'y1': max(0, height - 150),
'x2': min(width, 150),
'y2': height - 10
},
# Bottom-right corner
{
'name': 'bottom_right',
'x1': max(0, width - 150),
'y1': max(0, height - 150),
'x2': width - 10,
'y2': height - 10
}
]
image_regions = []
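        # Continue id numbering after the text regions so ids stay unique
        # across both result lists.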
region_id = len(text_regions) + 1
for sample in sample_regions:
x1, y1 = sample['x1'], sample['y1']
x2, y2 = sample['x2'], sample['y2']
# Skip if region is too small
if x2 - x1 < 50 or y2 - y1 < 50:
continue
# Extract region
region_crop = book_image[y1:y2, x1:x2]
# Check if region has enough visual content (not just solid color)
gray = cv2.cvtColor(region_crop, cv2.COLOR_BGR2GRAY)
std_dev = np.std(gray)
# Only classify if there's visual complexity (not blank/solid color)
if std_dev > 15: # Threshold for visual complexity
result = self._classify_single_image(region_crop)
image_regions.append({
'id': region_id,
'location': sample['name'],
'bbox': {'x': x1, 'y': y1, 'width': x2-x1, 'height': y2-y1},
'pytorch_class': result['class_name'],
'confidence': result['confidence'],
'top_5': result['top_5'],
'type': 'image'
})
region_id += 1
return image_regions
def _classify_single_image(self, opencv_image):
"""
Classify single image region with PyTorch.
ML CONCEPT: Inference Pipeline
===============================
1. Convert BGR → RGB → PIL
2. Resize, crop, normalize (preprocessing)
3. Convert to tensor, add batch dim
4. Forward pass through ResNet18
5. Softmax to get probabilities
6. Return top predictions
"""
# Convert OpenCV (BGR) → PIL (RGB)
rgb = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb)
# Preprocess
tensor = self.transform(pil_image)
tensor = tensor.unsqueeze(0) # Add batch dimension
# Inference
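        # no_grad() skips autograd bookkeeping; the model returns raw logits
        # over the 1000 ImageNet classes (shape [1, 1000]).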
with torch.no_grad():
outputs = self.pytorch_model(tensor)
probs = torch.nn.functional.softmax(outputs[0], dim=0)
# Get top 5
top_probs, top_indices = torch.topk(probs, 5)
top_class_id = top_indices[0].item()
top_confidence = top_probs[0].item()
# Get actual ImageNet class name
if self.imagenet_classes and top_class_id < len(self.imagenet_classes):
class_name = self.imagenet_classes[top_class_id]
else:
class_name = f"element_{top_class_id}"
        top_5 = []
        for idx, prob in zip(top_indices, top_probs):
            cid = idx.item()
            if self.imagenet_classes and cid < len(self.imagenet_classes):
                name = self.imagenet_classes[cid]
            else:
                name = f"class_{cid}"
            top_5.append({
                'class_id': cid,
                'class_name': name,
                'confidence': prob.item()
            })
return {
'class_id': top_class_id,
'class_name': class_name,
'confidence': float(top_confidence),
'top_5': top_5
}
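# Minimal usage sketch. The image path below is a placeholder; point it at a
# real cover photo before running.
if __name__ == '__main__':
    analyzer = BookCoverAnalyzer(verbose=True)
    result = analyzer.analyze('cover.jpg')  # hypothetical sample path
    if result['success']:
        print(result['summary'])
        print(result['interpretation'].get('inferred_title'))
    else:
        print('Error:', result['error'])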