Upload folder using huggingface_hub
Browse files- README.md +6 -5
- app.py +4 -4
- cli_mood_reader.py +3 -3
- flask_app.py +2 -2
- src/__init__.py +0 -0
- src/config.py +56 -0
- src/data_fetcher.py +112 -0
- src/hn_mood_reader.py +71 -0
- src/model_trainer.py +162 -0
- src/vibe_logic.py +85 -0
README.md
CHANGED
|
@@ -160,11 +160,12 @@ Key parameters can be adjusted in `config.py`:
|
|
| 160 |
βββ app.py # Main Gradio application for fine-tuning
|
| 161 |
βββ cli_mood_reader.py # Interactive command-line mood reader
|
| 162 |
βββ flask_app.py # Standalone Flask application for mood reading
|
| 163 |
-
βββ
|
| 164 |
-
βββ
|
| 165 |
-
βββ
|
| 166 |
-
βββ
|
| 167 |
-
βββ
|
|
|
|
| 168 |
βββ requirements.txt # Python package dependencies
|
| 169 |
βββ README.md # This file
|
| 170 |
βββ artifacts/ # Stores session-specific fine-tuned models and datasets
|
|
|
|
| 160 |
βββ app.py # Main Gradio application for fine-tuning
|
| 161 |
βββ cli_mood_reader.py # Interactive command-line mood reader
|
| 162 |
βββ flask_app.py # Standalone Flask application for mood reading
|
| 163 |
+
├── src/ # Source code for the application
|
| 164 |
+
│   ├── config.py # Central configuration for all modules
|
| 165 |
+
│   ├── data_fetcher.py # Fetches and caches the Hacker News RSS feed
|
| 166 |
+
│   ├── hn_mood_reader.py # Core logic for fetching and scoring
|
| 167 |
+
│   ├── model_trainer.py # Handles model loading and fine-tuning
|
| 168 |
+
│   └── vibe_logic.py # Calculates similarity scores and "vibe" status
|
| 169 |
βββ requirements.txt # Python package dependencies
|
| 170 |
βββ README.md # This file
|
| 171 |
βββ artifacts/ # Stores session-specific fine-tuned models and datasets
|
app.py
CHANGED
|
@@ -9,16 +9,16 @@ from typing import List, Iterable, Tuple, Optional, Callable
|
|
| 9 |
from datetime import datetime
|
| 10 |
|
| 11 |
# Import modules
|
| 12 |
-
from data_fetcher import read_hacker_news_rss, format_published_time
|
| 13 |
-
from model_trainer import (
|
| 14 |
authenticate_hf,
|
| 15 |
train_with_dataset,
|
| 16 |
get_top_hits,
|
| 17 |
load_embedding_model,
|
| 18 |
upload_model_to_hub
|
| 19 |
)
|
| 20 |
-
from config import AppConfig
|
| 21 |
-
from vibe_logic import VibeChecker
|
| 22 |
from sentence_transformers import SentenceTransformer
|
| 23 |
|
| 24 |
# --- Main Application Class (Session Scoped) ---
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
|
| 11 |
# Import modules
|
| 12 |
+
from src.data_fetcher import read_hacker_news_rss, format_published_time
|
| 13 |
+
from src.model_trainer import (
|
| 14 |
authenticate_hf,
|
| 15 |
train_with_dataset,
|
| 16 |
get_top_hits,
|
| 17 |
load_embedding_model,
|
| 18 |
upload_model_to_hub
|
| 19 |
)
|
| 20 |
+
from src.config import AppConfig
|
| 21 |
+
from src.vibe_logic import VibeChecker
|
| 22 |
from sentence_transformers import SentenceTransformer
|
| 23 |
|
| 24 |
# --- Main Application Class (Session Scoped) ---
|
cli_mood_reader.py
CHANGED
|
@@ -7,9 +7,9 @@ from typing import List
|
|
| 7 |
|
| 8 |
# --- Core Logic Imports ---
|
| 9 |
# These modules contain the application's functionality.
|
| 10 |
-
from config import AppConfig
|
| 11 |
-
from hn_mood_reader import HnMoodReader, FeedEntry
|
| 12 |
-
from vibe_logic import VIBE_THRESHOLDS
|
| 13 |
|
| 14 |
# --- Helper Functions ---
|
| 15 |
|
|
|
|
| 7 |
|
| 8 |
# --- Core Logic Imports ---
|
| 9 |
# These modules contain the application's functionality.
|
| 10 |
+
from src.config import AppConfig
|
| 11 |
+
from src.hn_mood_reader import HnMoodReader, FeedEntry
|
| 12 |
+
from src.vibe_logic import VIBE_THRESHOLDS
|
| 13 |
|
| 14 |
# --- Helper Functions ---
|
| 15 |
|
flask_app.py
CHANGED
|
@@ -7,8 +7,8 @@ from typing import Optional
|
|
| 7 |
from flask import Flask, render_template
|
| 8 |
|
| 9 |
# Your existing config and core logic
|
| 10 |
-
from config import AppConfig
|
| 11 |
-
from hn_mood_reader import HnMoodReader, FeedEntry
|
| 12 |
|
| 13 |
# --- Flask App Initialization ---
|
| 14 |
app = Flask(__name__)
|
|
|
|
| 7 |
from flask import Flask, render_template
|
| 8 |
|
| 9 |
# Your existing config and core logic
|
| 10 |
+
from src.config import AppConfig
|
| 11 |
+
from src.hn_mood_reader import HnMoodReader, FeedEntry
|
| 12 |
|
| 13 |
# --- Flask App Initialization ---
|
| 14 |
app = Flask(__name__)
|
src/__init__.py
ADDED
|
File without changes
|
src/config.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Final
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
# --- Base Directory Definition ---
|
| 6 |
+
# Use Path for modern, OS-agnostic path handling
|
| 7 |
+
ARTIFACTS_DIR: Final[Path] = Path("artifacts")
|
| 8 |
+
|
| 9 |
+
class AppConfig:
|
| 10 |
+
"""
|
| 11 |
+
Central configuration class for the Hacker News Fine-Tuner application.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
# --- Directory/Environment Configuration ---
|
| 15 |
+
ARTIFACTS_DIR: Final[Path] = ARTIFACTS_DIR
|
| 16 |
+
|
| 17 |
+
# Environment variable for Hugging Face token (used by model_trainer)
|
| 18 |
+
HF_TOKEN: Final[str | None] = os.getenv('HF_TOKEN')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# --- Caching/Data Fetching Configuration ---
|
| 22 |
+
HN_RSS_URL: Final[str] = "https://news.ycombinator.com/rss"
|
| 23 |
+
|
| 24 |
+
# Filename for the pickled cache data (using Path.joinpath)
|
| 25 |
+
CACHE_FILE: Final[Path] = ARTIFACTS_DIR.joinpath("hacker_news_cache.pkl")
|
| 26 |
+
|
| 27 |
+
# Cache duration set to 30 minutes (1800 seconds)
|
| 28 |
+
CACHE_DURATION_SECONDS: Final[int] = 60 * 30
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# --- Model/Training Configuration ---
|
| 32 |
+
|
| 33 |
+
# Name of the pre-trained embedding model
|
| 34 |
+
MODEL_NAME: Final[str] = 'google/embeddinggemma-300M'
|
| 35 |
+
|
| 36 |
+
# Task name for prompting the embedding model (e.g., for instruction tuning)
|
| 37 |
+
TASK_NAME: Final[str] = "Classification"
|
| 38 |
+
|
| 39 |
+
# Output directory for the fine-tuned model
|
| 40 |
+
OUTPUT_DIR: Final[Path] = ARTIFACTS_DIR.joinpath("embedding-gemma-finetuned-hn")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# --- Gradio/App-Specific Configuration ---
|
| 44 |
+
|
| 45 |
+
# Anchor text used for contrastive learning dataset generation
|
| 46 |
+
QUERY_ANCHOR: Final[str] = "MY_FAVORITE_NEWS"
|
| 47 |
+
|
| 48 |
+
# Number of titles shown for user selection in the Gradio interface
|
| 49 |
+
TOP_TITLES_COUNT: Final[int] = 10
|
| 50 |
+
|
| 51 |
+
# Default export path for the dataset CSV
|
| 52 |
+
DATASET_EXPORT_FILENAME: Final[Path] = ARTIFACTS_DIR.joinpath("training_dataset.csv")
|
| 53 |
+
|
| 54 |
+
# Default model for the standalone Mood Reader tab
|
| 55 |
+
DEFAULT_MOOD_READER_MODEL: Final[str] = "bebechien/embedding-gemma-finetuned-hn"
|
| 56 |
+
|
src/data_fetcher.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import feedparser
|
| 2 |
+
import pickle
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import Tuple, Any, Optional
|
| 7 |
+
|
| 8 |
+
# Assuming AppConfig is passed in via dependency injection in the refactored main app.
|
| 9 |
+
|
| 10 |
+
def format_published_time(published_parsed: Optional[time.struct_time]) -> str:
    """Render a feedparser time struct as 'YYYY-MM-DD HH:MM', or 'N/A'."""
    if not published_parsed:
        return 'N/A'
    try:
        stamp = time.mktime(published_parsed)
        return datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M')
    except Exception:
        # Malformed time struct — fall back to the placeholder.
        return 'N/A'
| 19 |
+
|
| 20 |
+
def load_feed_from_cache(config: Any) -> Tuple[Optional[Any], str]:
    """Return (feed, status) from the pickle cache, or (None, reason) on a miss."""
    cache_path = config.CACHE_FILE
    if not os.path.exists(cache_path):
        return None, "Cache file not found."

    try:
        # Freshness check based on the file's modification time.
        age = time.time() - os.path.getmtime(cache_path)
        if age > config.CACHE_DURATION_SECONDS:
            return None, f"Cache expired ({age:.0f}s old, limit is {config.CACHE_DURATION_SECONDS}s)."

        with open(cache_path, 'rb') as fh:
            cached_feed = pickle.load(fh)
        return cached_feed, f"Loaded successfully from cache (Age: {age:.0f}s)."

    except Exception as exc:
        # Corrupted cache: warn, best-effort delete, then signal a re-fetch.
        print(f"Warning: Failed to load cache file. Deleting corrupted cache. Reason: {exc}")
        try:
            os.remove(cache_path)
        except OSError:
            pass  # Ignore if removal fails
        return None, "Cache file corrupted or invalid. Will re-fetch."
| 45 |
+
|
| 46 |
+
def save_feed_to_cache(config: Any, feed: Any) -> None:
    """Pickle the feed object to the configured cache file (best effort)."""
    try:
        with open(config.CACHE_FILE, 'wb') as fh:
            pickle.dump(feed, fh)
    except Exception as exc:
        # Cache persistence is non-critical — report and carry on.
        print(f"Error saving to cache: {exc}")
    else:
        print(f"Successfully saved new feed data to cache: {config.CACHE_FILE}")
| 54 |
+
|
| 55 |
+
def read_hacker_news_rss(config: Any) -> Tuple[Optional[Any], str]:
    """
    Read and parse the Hacker News RSS feed, preferring the local cache.

    Args:
        config: Object exposing HN_RSS_URL, CACHE_FILE and
            CACHE_DURATION_SECONDS (e.g. AppConfig).

    Returns:
        (feed, status_message): the parsed feedparser object, or None when
        nothing usable could be fetched.
    """
    url = config.HN_RSS_URL
    print(f"Attempting to fetch and parse RSS feed from: {url}")
    print("-" * 50)

    # 1. Attempt to load from cache.
    feed, cache_status = load_feed_from_cache(config)
    print(f"Cache Status: {cache_status}")

    if feed is not None:
        return feed, cache_status

    # 2. Cache miss or stale: fetch from the network.
    print("Starting network fetch...")
    try:
        feed = feedparser.parse(url)

        # FIX: feedparser only sets `.status` for HTTP(S) fetches. On a
        # failed/non-HTTP result the bare attribute access raised
        # AttributeError, which the generic handler then mis-reported as
        # an "unexpected error". Guard with getattr instead.
        status = getattr(feed, 'status', None)
        if status is not None and status >= 400:
            status_msg = f"Error fetching the feed. HTTP Status: {status}"
            print(status_msg)
            return None, status_msg

        if feed.bozo:
            # Bozo is set if any error occurred, even non-critical ones.
            print(f"Warning: Failed to fully parse the feed. Reason: {feed.get('bozo_exception')}")

        # 3. Fetch succeeded: persist to cache when we actually got entries.
        if feed.entries:
            save_feed_to_cache(config, feed)
            status_msg = f"Successfully fetched and cached {len(feed.entries)} entries."
        else:
            status_msg = "Fetch successful, but no entries found in the feed."
            print(status_msg)
            feed = None  # Ensure feed is None if no entries

    except Exception as e:
        status_msg = f"An unexpected error occurred during network processing: {e}"
        print(status_msg)
        return None, status_msg

    return feed, status_msg
| 102 |
+
|
| 103 |
+
# Example usage (not part of the refactored module's purpose but good for testing).
# Run with `python -m src.data_fetcher` (package context); running the file
# directly also works via the absolute-import fallback below.
if __name__ == '__main__':
    try:
        from .config import AppConfig
    except ImportError:
        # FIX: when executed as a plain script there is no parent package,
        # so the relative import raises ImportError — fall back to a
        # direct import from the same directory.
        from config import AppConfig

    feed, status = read_hacker_news_rss(AppConfig)
    if feed and feed.entries:
        print(f"\nFetched {len(feed.entries)} entries. Top 3 titles:")
        for entry in feed.entries[:3]:
            print(f"- {entry.title}")
    else:
        print(f"Could not fetch the feed. Status: {status}")
src/hn_mood_reader.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# hn_mood_reader.py
|
| 2 |
+
|
| 3 |
+
import feedparser
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import List
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Assuming these are in separate files as in the original structure
|
| 10 |
+
from .config import AppConfig
|
| 11 |
+
from .data_fetcher import format_published_time
|
| 12 |
+
from .vibe_logic import VibeChecker, VibeResult
|
| 13 |
+
|
| 14 |
+
# --- Data Structures ---
|
| 15 |
+
@dataclass(frozen=True)
class FeedEntry:
    """Stores necessary data for a single HN story, including its calculated mood."""
    title: str  # story headline, taken from the RSS entry
    link: str  # URL of the story itself
    comments_link: str  # URL of the HN comments page ('#' when the feed omits it)
    published_time_str: str  # pre-formatted 'YYYY-MM-DD HH:MM', or 'N/A'
    mood: VibeResult  # similarity-based vibe computed from the title
| 23 |
+
|
| 24 |
+
# --- Core Logic Class ---
|
| 25 |
+
class HnMoodReader:
    """Handles model initialization and mood scoring for Hacker News titles."""

    def __init__(self, model_name: str):
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError("Please install 'sentence-transformers'") from e

        print(f"Initializing SentenceTransformer with model: {model_name}...")
        # Truncate embeddings to 128 dimensions to keep scoring cheap.
        self.model = SentenceTransformer(model_name, truncate_dim=128)
        print("Model initialized successfully.")

        # Shared scorer: compares each title against the configured anchor.
        self.vibe_checker = VibeChecker(
            model=self.model,
            query_anchor=AppConfig.QUERY_ANCHOR,
            task_name=AppConfig.TASK_NAME,
        )
        self.model_name = model_name

    def _get_mood_result(self, title: str) -> VibeResult:
        """Calculates the mood for a title using the VibeChecker."""
        return self.vibe_checker.check(title)

    def fetch_and_score_feed(self) -> List[FeedEntry]:
        """Fetch the HN RSS feed, score every usable entry, and sort best-first."""
        feed = feedparser.parse(AppConfig.HN_RSS_URL)
        if feed.bozo:
            raise IOError(f"Error parsing feed from {AppConfig.HN_RSS_URL}.")

        results: List[FeedEntry] = []
        for item in feed.entries:
            title = item.get('title')
            link = item.get('link')
            # Skip entries missing either field — both are required downstream.
            if not (title and link):
                continue

            results.append(
                FeedEntry(
                    title=title,
                    link=link,
                    comments_link=item.get('comments', '#'),
                    published_time_str=format_published_time(item.published_parsed),
                    mood=self._get_mood_result(title),
                )
            )

        # Highest similarity first (sorted() is stable, like list.sort()).
        return sorted(results, key=lambda e: e.mood.raw_score, reverse=True)
src/model_trainer.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import login, HfApi # Updated import
|
| 2 |
+
from sentence_transformers import SentenceTransformer, util
|
| 3 |
+
from datasets import Dataset
|
| 4 |
+
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
|
| 5 |
+
from sentence_transformers.losses import MultipleNegativesRankingLoss
|
| 6 |
+
from transformers import TrainerCallback, TrainingArguments
|
| 7 |
+
from typing import List, Callable, Optional
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# --- Model/Utility Functions ---
|
| 11 |
+
|
| 12 |
+
def authenticate_hf(token: Optional[str]) -> None:
    """Log into the Hugging Face Hub when a token is provided; otherwise skip."""
    if not token:
        print("Skipping Hugging Face login: HF_TOKEN not set.")
        return
    print("Logging into Hugging Face Hub...")
    login(token=token)
| 19 |
+
|
| 20 |
+
def load_embedding_model(model_name: str) -> SentenceTransformer:
    """Initialize a SentenceTransformer, auto-placing it across available devices."""
    print(f"Loading Sentence Transformer model: {model_name}")
    try:
        st_model = SentenceTransformer(model_name, model_kwargs={"device_map": "auto"})
        print(f"Model loaded successfully. {st_model.device}")
        return st_model
    except Exception as exc:
        # Surface the failure, then let the caller decide how to handle it.
        print(f"Error loading Sentence Transformer model {model_name}: {exc}")
        raise
| 30 |
+
|
| 31 |
+
def get_top_hits(
    model: SentenceTransformer,
    target_titles: List[str],
    task_name: str,
    query: str = "MY_FAVORITE_NEWS",
    top_k: int = 5
) -> str:
    """Performs semantic search on target_titles and returns a formatted result string."""
    if not target_titles:
        return "No target titles available for search."

    # Embed the query and the corpus (each encoded once per call).
    query_vec = model.encode(query, prompt_name=task_name)
    corpus_vecs = model.encode(target_titles, prompt_name=task_name)

    # Rank the corpus against the query.
    hits = util.semantic_search(query_vec, corpus_vecs, top_k=top_k)[0]

    # One "[title] score" line per hit, best first.
    lines = [
        f"[{target_titles[hit['corpus_id']]}] {hit['score']:.4f}"
        for hit in hits
    ]
    return "\n".join(lines)
| 58 |
+
|
| 59 |
+
def upload_model_to_hub(folder_path: Path, repo_name: str, token: str) -> str:
    """
    Uploads a local model folder to the Hugging Face Hub.
    Creates the repository if it doesn't exist.

    Args:
        folder_path: Local directory containing the saved model files.
        repo_name: Bare repo name; the authenticated user's namespace is prepended.
        token: Hugging Face access token.

    Returns:
        A human-readable success or failure message.
    """
    try:
        api = HfApi(token=token)

        # Resolve the authenticated user's namespace for the full repo id.
        user_info = api.whoami()
        username = user_info['name']
        repo_id = f"{username}/{repo_name}"
        print(f"Preparing to upload to: {repo_id}")

        # Create the repo (safe if it already exists).
        api.create_repo(repo_id=repo_id, exist_ok=True)

        # Upload the folder.
        url = api.upload_folder(
            folder_path=folder_path,
            repo_id=repo_id,
            repo_type="model"
        )
        # FIX: the success/failure markers were mojibake ("β") — restored
        # to the intended emoji.
        return f"✅ Success! Model published at: {url}"
    except Exception as e:
        print(f"Upload failed: {e}")
        return f"❌ Upload failed: {str(e)}"
| 88 |
+
|
| 89 |
+
# --- Training Class and Function ---
|
| 90 |
+
|
| 91 |
+
class EvaluationCallback(TrainerCallback):
    """
    A callback that runs the semantic search evaluation at the end of each log step.
    The search function is passed in during initialization.
    """
    def __init__(self, search_fn: Callable[[], str]) -> None:
        # Zero-argument callable that returns the formatted evaluation string.
        self.search_fn = search_fn

    def on_log(self, args: TrainingArguments, state, control, **kwargs) -> None:
        # Invoked by the Trainer on each log event; prints current
        # semantic-search results so training progress is visible.
        print(f"Step {state.global_step} finished. Running evaluation:")
        print(f"\n{self.search_fn()}\n")
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def train_with_dataset(
    model: SentenceTransformer,
    dataset: List[List[str]],
    output_dir: Path,
    task_name: str,
    search_fn: Callable[[], str]
) -> None:
    """
    Fine-tunes the provided Sentence Transformer MODEL on the dataset.

    The dataset should be a list of lists: [[anchor, positive, negative], ...].

    Args:
        model: SentenceTransformer instance, fine-tuned in place.
        dataset: Contrastive triplets, one [anchor, positive, negative] per row.
        output_dir: Directory where the final fine-tuned model is saved.
        task_name: Key used to look up the model's prompt for this task.
        search_fn: Zero-arg callable run by EvaluationCallback on each log event.
    """
    # Convert to Hugging Face Dataset format
    data_as_dicts = [
        {"anchor": row[0], "positive": row[1], "negative": row[2]}
        for row in dataset
    ]

    train_dataset = Dataset.from_list(data_as_dicts)

    # Use MultipleNegativesRankingLoss, suitable for contrastive learning
    loss = MultipleNegativesRankingLoss(model)

    # Note: SentenceTransformer models typically have a 'prompts' attribute
    # which we need to access for the training arguments.
    prompts = getattr(model, 'prompts', {}).get(task_name)
    if not prompts:
        print(f"Warning: Could not find prompts for task '{task_name}' in model. Training may be less effective.")
        # Fallback to an empty list or appropriate default if required by the model's structure
        # NOTE(review): an empty list may not be a valid `prompts` value for
        # SentenceTransformerTrainingArguments — confirm against library docs.
        prompts = []

    args = SentenceTransformerTrainingArguments(
        output_dir=output_dir,
        prompts=prompts,
        num_train_epochs=4,
        per_device_train_batch_size=1,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        # Log once per epoch-equivalent (one row per step at batch size 1).
        logging_steps=train_dataset.num_rows,
        report_to="none",
        save_strategy="no" # No saving during training, only at the end
    )

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        loss=loss,
        callbacks=[EvaluationCallback(search_fn)]
    )

    trainer.train()

    print("Training finished. Model weights are updated in memory.")

    # Save the final fine-tuned model
    trainer.save_model()

    print(f"Model saved locally to: {output_dir}")
src/vibe_logic.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from math import floor
|
| 3 |
+
from typing import List
|
| 4 |
+
from sentence_transformers import SentenceTransformer, util
|
| 5 |
+
|
| 6 |
+
# --- Data Structures ---
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
class VibeThreshold:
    """Defines a threshold for a Vibe status."""
    score: float  # inclusive lower bound on the clamped similarity score
    status: str  # label displayed when the score reaches this threshold

@dataclass(frozen=True)
class VibeResult:
    """Stores the calculated HSL color and status for a given score."""
    raw_score: float  # original (unclamped) similarity score
    status_html: str # Pre-formatted HTML for display
    color_hsl: str # Raw HSL color string

# Define the status thresholds from highest score to lowest score.
# map_score_to_vibe picks the FIRST entry whose score the value meets,
# so the ordering here is load-bearing.
# NOTE(review): the status glyphs (e.g. "β¨") look mojibake'd — likely
# originally emoji; confirm against the rendered UI before changing them.
VIBE_THRESHOLDS: List[VibeThreshold] = [
    VibeThreshold(score=0.8, status="β¨ VIBE:HIGH"),
    VibeThreshold(score=0.5, status="π VIBE:GOOD"),
    VibeThreshold(score=0.2, status="π VIBE:FLAT"),
    VibeThreshold(score=0.0, status="π VIBE:LOW "), # Base case for scores < 0.2
]
| 28 |
+
|
| 29 |
+
# --- Utility Functions ---
|
| 30 |
+
|
| 31 |
+
def map_score_to_vibe(score: float) -> VibeResult:
    """
    Maps a cosine similarity score to a VibeResult containing status, HTML, and color.
    """
    # Clamp into [0, 1] so color/status lookups stay well-defined.
    bounded = min(1.0, max(0.0, score))

    # Color: linearly interpolate hue from 0 (red) to 120 (green).
    color_hsl = f"hsl({floor(bounded * 120)}, 80%, 50%)"

    # Status: first threshold (highest first) that the score meets;
    # the final entry (score 0.0) acts as the catch-all default.
    status_text = next(
        (t.status for t in VIBE_THRESHOLDS if bounded >= t.score),
        VIBE_THRESHOLDS[-1].status,
    )

    # Pre-render the colored label for direct display.
    status_html = f"<span style='color: {color_hsl}; font-weight: bold;'>{status_text}</span>"

    # Note: the result keeps the ORIGINAL (unclamped) score.
    return VibeResult(raw_score=score, status_html=status_html, color_hsl=color_hsl)
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# --- Core Logic Class ---
|
| 56 |
+
|
| 57 |
+
class VibeChecker:
    """
    Handles similarity scoring using a SentenceTransformer model and a pre-set anchor query.
    """

    def __init__(self, model: SentenceTransformer, query_anchor: str, task_name: str):
        self.model = model
        self.query_anchor = query_anchor
        self.task_name = task_name

        # Embed the anchor once up front; every check() reuses this vector.
        self.query_embedding = self.model.encode(
            self.query_anchor,
            prompt_name=self.task_name,
            normalize_embeddings=True,
        )

    def check(self, text: str) -> VibeResult:
        """
        Calculates the "vibe" of a given text against the pre-configured anchor.
        """
        candidate = self.model.encode(
            text,
            prompt_name=self.task_name,
            normalize_embeddings=True,
        )
        # With unit-normalized vectors the dot product equals cosine similarity.
        similarity: float = util.dot_score(self.query_embedding, candidate).item()
        return map_score_to_vibe(similarity)