"""
HuggingFace Enabling Sessions - Gradio Interactive Demo
App Hosted on HuggingFace Spaces
"""

import gradio as gr
import pandas as pd

import config
import utils

try:
    import spaces
except Exception:
    # The `spaces` package may be missing or fail to import outside the
    # hosting environment. Fall back to a no-op stand-in so `@spaces.GPU`
    # keeps working as a decorator.
    class _SpacesFallback:
        """Minimal stand-in exposing a pass-through ``GPU`` decorator."""

        @staticmethod
        def GPU(func=None, *args, **kwargs):
            """Return the decorated function unchanged.

            Supports both the bare form (``@spaces.GPU``) and the called
            form (``@spaces.GPU(...)``); extra arguments are ignored.
            """
            if func is None:
                def decorator(inner_func):
                    return inner_func

                return decorator
            return func

    spaces = _SpacesFallback()


# ===================== UTILITIES =====================

def load_sample_texts():
    """Load sample texts from the CSV named by ``config.SAMPLE_DATA_CSV``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the file cannot be
        read or parsed (loading is best-effort).
    """
    try:
        return pd.read_csv(config.SAMPLE_DATA_CSV)
    except Exception:
        # Best-effort: a missing or malformed sample file must not crash the app.
        return None


def _read_example_file(filename, separator="\n"):
    """Read a sample file from ``config.DEMO_SAMPLES_DIR`` and split it.

    Args:
        filename: File name inside the demo samples directory.
        separator: String the stripped file content is split on.

    Returns:
        The list of entries, or ``None`` when the file cannot be read.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/{filename}", encoding="utf-8") as f:
            return f.read().strip().split(separator)
    except Exception:
        # Best-effort: callers fall back to the examples defined in config.
        return None


def get_sentiment_examples():
    """Get example texts for sentiment analysis."""
    examples = _read_example_file("sentiment.txt")
    if examples is None:
        # Wrap the whole sentence in a list; splitting it on whitespace would
        # yield single words instead of example texts.
        return [config.TASKS["sentiment"]["example"]]
    return examples


def get_ner_examples():
    """Get example texts for NER."""
    examples = _read_example_file("ner.txt")
    if examples is None:
        return [config.TASKS["ner"]["example"]]
    return examples


def get_qa_examples():
    """Get example contexts for QA (entries are separated by blank lines)."""
    contexts = _read_example_file("qa.txt", separator="\n\n")
    if contexts is None:
        return [config.TASKS["qa"]["example_context"]]
    return contexts


def get_summarization_examples():
    """Get example texts for summarization."""
    examples = _read_example_file("summarization.txt")
    if examples is None:
        return [config.TASKS["summarization"]["example"]]
    return examples


def get_embeddings_examples():
    """Get example texts for semantic similarity."""
    examples = _read_example_file("embeddings.txt")
    if examples is None:
        return [
            config.TASKS["similarity"]["example1"],
            config.TASKS["similarity"]["example2"],
        ]
    return examples


# ===================== SENTIMENT ANALYSIS =====================

@spaces.GPU
def demo_sentiment(text):
    """Demo sentiment analysis.

    Returns:
        A ``(markdown, details)`` tuple for the Markdown and JSON outputs.
        ``details`` is always a dict so the JSON component can serialize it.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        return message, {"error": message}

    result = utils.run_sentiment_analysis(text)
    # Mirror the error handling of the other demo handlers.
    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}", {"error": result["error"]}

    output = f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.4f}"
    return output, result


# ===================== NER =====================

@spaces.GPU
def demo_ner(text):
    """Demo named entity recognition.

    Returns:
        A ``(markdown, details)`` tuple; ``details`` is the raw result list,
        or an ``{"error": ...}`` dict on empty input or failure.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        return message, {"error": message}

    results = utils.run_ner(text)
    # A failure is reported as a list whose first item carries an "error" key.
    if (
        results
        and isinstance(results, list)
        and isinstance(results[0], dict)
        and "error" in results[0]
    ):
        return f"Error: {results[0]['error']}", {"error": results[0]["error"]}

    formatted = utils.format_ner_output(results)
    return formatted, results


# ===================== QUESTION ANSWERING =====================

@spaces.GPU
def demo_qa(context, question):
    """Demo question answering.

    Returns:
        A ``(markdown, details)`` tuple; ``details`` is the raw result dict,
        or an empty dict on missing input or failure.
    """
    if not context or not question or not context.strip() or not question.strip():
        return "Please enter both context and question", {}

    result = utils.run_qa(context, question)
    if "error" in result:
        return f"Error: {result['error']}", {}

    output = f"**Answer:** {result['answer']}\n\n**Confidence:** {result['score']:.4f}"
    return output, result


# ===================== SUMMARIZATION =====================

@spaces.GPU
def demo_summarization(text):
    """Demo text summarization.

    Inputs shorter than 20 whitespace-separated words are rejected.

    Returns:
        A ``(summary_or_message, details)`` tuple; ``details`` is either
        ``{"summary": ...}`` or ``{"error": ...}``.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        return message, {"error": message}

    if len(text.split()) < 20:
        message = "Text too short for summarization. Please provide at least 20 words."
        return message, {"error": message}

    summary = utils.run_summarization(text)
    # The helper signals failure with a string starting with "Error:".
    if summary.startswith("Error:"):
        return summary, {"error": summary}
    return summary, {"summary": summary}


# ===================== SEMANTIC SIMILARITY =====================

@spaces.GPU
def demo_similarity(text1, text2):
    """Demo semantic similarity.

    Returns:
        A ``(markdown, score)`` tuple; ``score`` is 0 on missing input or when
        the helper returns a (string) error message.
    """
    if not text1 or not text2 or not text1.strip() or not text2.strip():
        return "Please enter both texts", 0

    similarity = utils.compute_similarity(text1, text2)
    # A string result is an error message rather than a score.
    if isinstance(similarity, str):
        return similarity, 0

    output = (
        f"**Similarity Score:** {similarity:.4f}\n\n"
        "(Score ranges from -1 to 1, where 1 means identical semantically)"
    )
    return output, similarity


# ===================== TOKENIZATION =====================

@spaces.GPU
def demo_tokenization(text):
    """Demo tokenization.

    Returns:
        A ``(markdown, details)`` tuple; ``details`` is the raw tokenizer
        result, or an ``{"error": ...}`` dict on empty input or failure.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        return message, {"error": message}

    result = utils.tokenize_text(text)
    if "error" in result:
        return f"Error: {result['error']}", {"error": result["error"]}

    formatted = utils.format_tokenizer_output(result)
    return formatted, result


# ===================== GRADIO INTERFACE =====================

def _build_session1_tab():
    """Add the "Session 1" tab: introduction text plus two quick demos."""
    with gr.Tab("Session 1: Introduction (45 min)", id="session1"):
        gr.Markdown(
            """
            ## 🎯 Introduction to Hugging Face Ecosystem

            ### What We'll Cover:

            1. **HuggingFace Platform Overview**
               - The Hub: Central repository for models, datasets, and spaces
               - Transformers Library: Core Python library for NLP
               - Model Cards: Documentation and metadata for transparency

            2. **Core Abstractions**
               - **Pipelines:** High-level API for common tasks (sentiment, NER, QA, etc.)
               - **Models & Tokenizers:** Lower-level building blocks
               - **Datasets:** Standardized data loading and processing

            3. **Architecture Patterns**
               - **Encoders:** BERT, RoBERTa, DistilBERT → Classification, feature extraction
               - **Decoders:** GPT-2, GPT-3 → Text generation
               - **Encoder-Decoders:** T5, BART → Seq2seq (translation, summarization, QA)

            4. **Enterprise NLP Landscape**
               - Open-source vs. Commercial models
               - Licensing considerations (MIT, Apache, OpenRAIL, etc.)
               - Fine-tuning for domain-specific tasks

            ---

            ### Live Demo: Explore the Power of Pipelines

            Try the demos below to see how easy it is to use pre-trained models! 👇
            """
        )

        with gr.Group():
            gr.Markdown("### 📊 Demo 1: Sentiment Analysis")
            demo1_input = gr.Textbox(
                label="Enter text to analyze sentiment",
                value="I absolutely love this product!",
                lines=2,
            )
            demo1_btn = gr.Button("Analyze Sentiment", variant="primary")
            demo1_output = gr.Markdown(label="Result")
            demo1_json = gr.JSON(label="Raw Output", visible=False)
            demo1_btn.click(
                demo_sentiment,
                inputs=[demo1_input],
                outputs=[demo1_output, demo1_json],
            )

        with gr.Group():
            gr.Markdown("### 🏷️ Demo 2: Named Entity Recognition (NER)")
            demo2_input = gr.Textbox(
                label="Enter text for entity recognition",
                value="Apple Inc. was founded by Steve Jobs in Cupertino, California.",
                lines=2,
            )
            demo2_btn = gr.Button("Extract Entities", variant="primary")
            demo2_output = gr.Markdown(label="Entities Found")
            demo2_json = gr.JSON(label="Raw Output", visible=False)
            demo2_btn.click(
                demo_ner,
                inputs=[demo2_input],
                outputs=[demo2_output, demo2_json],
            )

        gr.Markdown(
            """
            ---

            ### 💡 Key Takeaways

            - Pre-trained models save time and resources
            - HuggingFace Pipelines abstract away complexity
            - Models are available for dozens of NLP tasks
            - Easy to fine-tune for specialized use cases

            **Next:** Head to Session 2 for hands-on development with Tokenizers and Advanced Inference! 🚀
            """
        )


def _build_inference_playground():
    """Add the nested task tabs (sentiment, NER, QA, summarization, similarity)."""
    with gr.Tabs():
        # Task 1: Sentiment
        with gr.Tab("Sentiment Analysis"):
            gr.Markdown(
                """
                **Classify text as positive, negative, or neutral**

                Model: DistilBERT fine-tuned on SST-2 dataset
                """
            )
            # Load once; the sample file should not be read twice.
            sent_examples = get_sentiment_examples()
            sent_input = gr.Textbox(
                label="Enter text",
                value=sent_examples[0] if sent_examples else "I love this!",
                lines=3,
            )
            sent_btn = gr.Button("Analyze", variant="primary")
            sent_output = gr.Markdown(label="Result")
            sent_json = gr.JSON(label="Details", visible=False)
            sent_btn.click(
                demo_sentiment,
                inputs=[sent_input],
                outputs=[sent_output, sent_json],
            )

        # Task 2: NER
        with gr.Tab("Named Entity Recognition"):
            gr.Markdown(
                """
                **Identify people, organizations, locations, and more**

                Model: BERT fine-tuned on CoNLL-2003 NER dataset
                """
            )
            ner_examples = get_ner_examples()
            ner_input = gr.Textbox(
                label="Enter text",
                value=ner_examples[0] if ner_examples else "Apple Inc. was founded by Steve Jobs",
                lines=3,
            )
            ner_btn = gr.Button("Extract Entities", variant="primary")
            ner_output = gr.Markdown(label="Entities")
            ner_json = gr.JSON(label="Details", visible=False)
            ner_btn.click(
                demo_ner,
                inputs=[ner_input],
                outputs=[ner_output, ner_json],
            )

        # Task 3: QA
        with gr.Tab("Question Answering"):
            gr.Markdown(
                """
                **Answer questions based on provided context**

                Model: RoBERTa fine-tuned on SQuAD 2.0
                """
            )
            qa_examples = get_qa_examples()
            qa_context = gr.Textbox(
                label="Context/Passage",
                value=qa_examples[0] if qa_examples else config.TASKS["qa"]["example_context"],
                lines=4,
            )
            qa_question = gr.Textbox(
                label="Question",
                value="What is the Hugging Face Hub?",
                lines=2,
            )
            qa_btn = gr.Button("Get Answer", variant="primary")
            qa_output = gr.Markdown(label="Answer")
            qa_json = gr.JSON(label="Details", visible=False)
            qa_btn.click(
                demo_qa,
                inputs=[qa_context, qa_question],
                outputs=[qa_output, qa_json],
            )

        # Task 4: Summarization
        with gr.Tab("Text Summarization"):
            gr.Markdown(
                """
                **Generate concise summaries of longer texts**

                Model: BART large fine-tuned on CNN/DailyMail
                """
            )
            sum_examples = get_summarization_examples()
            sum_input = gr.Textbox(
                label="Text to summarize (min 20 words)",
                value=sum_examples[0] if sum_examples else config.TASKS["summarization"]["example"],
                lines=5,
            )
            sum_btn = gr.Button("Summarize", variant="primary")
            sum_output = gr.Markdown(label="Summary")
            sum_json = gr.JSON(label="Details", visible=False)
            sum_btn.click(
                demo_summarization,
                inputs=[sum_input],
                outputs=[sum_output, sum_json],
            )

        # Task 5: Semantic Similarity
        with gr.Tab("Semantic Similarity"):
            gr.Markdown(
                """
                **Compare semantic similarity between texts**

                Model: Sentence-BERT (all-MiniLM-L6-v2)
                """
            )
            emb_examples = get_embeddings_examples()
            emb_text1 = gr.Textbox(
                label="First text",
                value=emb_examples[0] if len(emb_examples) > 0 else "The cat is sleeping",
                lines=2,
            )
            emb_text2 = gr.Textbox(
                label="Second text",
                value=emb_examples[1] if len(emb_examples) > 1 else "A feline is resting",
                lines=2,
            )
            emb_btn = gr.Button("Compare", variant="primary")
            emb_output = gr.Markdown(label="Similarity")
            emb_json = gr.JSON(label="Details", visible=False)
            emb_btn.click(
                demo_similarity,
                inputs=[emb_text1, emb_text2],
                outputs=[emb_output, emb_json],
            )


def _build_session2_tab():
    """Add the "Session 2" tab: tokenization explorer, playground and recap."""
    with gr.Tab("Session 2: Hands-On Developer (90 min)", id="session2"):
        gr.Markdown(
            """
            ## 👨‍💻 Building End-to-End NLP Workflows with Hugging Face

            ### Agenda:

            1. **Tokenization Deep Dive** (15 min)
               - Understanding tokenization, token IDs, and attention masks
               - How models process text internally

            2. **Inference Playground** (45 min)
               - Interactive demos across multiple NLP tasks
               - Learn how to use different model architectures
               - See real outputs and understand model confidence

            3. **Exercise Checkpoints** (20 min)
               - Try your own text inputs
               - Experiment with different examples
               - Q&A and troubleshooting

            4. **Next Steps & Resources** (10 min)
               - Publishing models to the Hub
               - Fine-tuning workflow overview
               - Post-session project ideas

            ---

            ### 🔤 Part 1: Tokenization Explorer
            """
        )

        with gr.Group():
            gr.Markdown(
                """
                #### How Tokenization Works
                - Text is split into tokens (words/subwords)
                - Each token gets a unique ID
                - Attention masks indicate which tokens are real vs. padding
                - This is how transformers \"understand\" text!
                """
            )
            tok_input = gr.Textbox(
                label="Enter text to tokenize",
                value="Hello, how are you?",
                lines=2,
            )
            tok_btn = gr.Button("Tokenize", variant="primary")
            tok_output = gr.Markdown(label="Tokens")
            tok_json = gr.JSON(label="Tokenization Details", visible=False)
            tok_btn.click(
                demo_tokenization,
                inputs=[tok_input],
                outputs=[tok_output, tok_json],
            )

        gr.Markdown(
            """
            ---

            ### 🎯 Part 2: Inference Playground (Choose a Task)
            """
        )

        _build_inference_playground()

        gr.Markdown(
            """
            ---

            ### 🚀 Part 3: Key Concepts Recap

            ✅ **Transformers Architecture:**
            - Self-attention mechanisms allow models to focus on relevant parts of text
            - Pre-training on large corpora + fine-tuning = transfer learning

            ✅ **Using HuggingFace:**
            - Pipelines for quick demos
            - Fine-tuning for custom tasks
            - Model Hub for sharing and collaboration

            ✅ **Production Considerations:**
            - Model size vs. accuracy tradeoff
            - Quantization and distillation for faster inference
            - Licensing and compliance for models
            """
        )


def _build_resources_tab():
    """Add the static "Resources & Next Steps" tab."""
    with gr.Tab("Resources & Next Steps", id="resources"):
        gr.Markdown(
            """
            ## 📚 Learning Resources

            ### Official Documentation
            - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/)
            - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets/)
            - [Hugging Face Hub Documentation](https://huggingface.co/docs/hub/)

            ### Tutorials & Courses
            - [Hugging Face Course (Free)](https://huggingface.co/course/)
            - [Transformers from Scratch](https://huggingface.co/docs/transformers/training)
            - [Fine-tuning Guide](https://huggingface.co/docs/transformers/training)

            ---

            ## 🛠️ Popular Models to Explore

            ### Text Classification
            - `distilbert-base-uncased-finetuned-sst-2-english` - Sentiment Analysis
            - `roberta-base` - General purpose classifier
            - `bert-base-multilingual-cased` - Multilingual support

            ### Named Entity Recognition
            - `dslim/bert-base-NER` - English NER
            - `xlm-roberta-base` - Multilingual NER

            ### Question Answering
            - `deepset/roberta-base-squad2` - SQuAD 2.0 fine-tuned
            - `bert-large-uncased-whole-word-masking-finetuned-squad` - BERT Large

            ### Text Generation
            - `gpt2` - Lightweight generation
            - `facebook/bart-large` - Sequence-to-sequence
            - `google/t5-base` - T5 for various tasks

            ### Embeddings & Similarity
            - `sentence-transformers/all-MiniLM-L6-v2` - Fast & efficient
            - `sentence-transformers/all-mpnet-base-v2` - High quality

            ---

            ## 💾 Popular Datasets

            - `glue` - General Language Understanding Evaluation
            - `wikitext` - Large language model benchmark
            - `squad` - Question answering dataset
            - `conll2003` - Named entity recognition
            - `imdb` - Sentiment analysis

            ---

            ## 🎯 Next Steps After the Sessions

            ### Beginner Path
            1. Explore models on the Hub
            2. Try different models on your own data
            3. Learn about fine-tuning concepts

            ### Intermediate Path
            1. Fine-tune a pre-trained model on your dataset
            2. Deploy a model to Spaces (like this demo!)
            3. Publish your model to the Hub

            ### Advanced Path
            1. Build multi-stage pipelines
            2. Implement custom training loops
            3. Contribute to open-source projects

            ---

            ## 🔗 Community & Support

            - [Hugging Face Forums](https://discuss.huggingface.co/)
            - [GitHub Issues](https://github.com/huggingface/transformers/issues)
            - [Twitter/X @huggingface](https://twitter.com/huggingface)
            - Company Slack/Teams Channels

            ---

            ## 📝 Session Information

            **Session 1: Introduction to Hugging Face** (45 minutes)
            - Overview of the ecosystem
            - Core abstractions (Pipelines, Models, Tokenizers)
            - Architecture patterns
            - Enterprise considerations

            **Session 2: Hands-On Developer Workshop** (90 minutes)
            - Tokenization deep dive
            - Interactive inference playground (5+ NLP tasks)
            - Live coding and experimentation
            - Best practices and next steps

            ---

            ### Questions?
            Feel free to reach out via Slack or email during the sessions! 💬
            """
        )


def create_interface():
    """Create the Gradio interface with 3 tabs.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        title="HuggingFace Enabling Sessions",
    ) as app:
        gr.Markdown(
            """
            # 🤗 HuggingFace Enabling Sessions

            **Interactive Demo for Transformers, Hub APIs, and Pipeline Abstractions**

            **Duration:** Session 1: 45 min | Session 2: 90 min
            """
        )

        # Components created inside the helpers attach to the enclosing
        # `gr.Tabs` context that is active when they are called.
        with gr.Tabs():
            # ===================== TAB 1: SESSION 1 - INTRODUCTION =====================
            _build_session1_tab()
            # ===================== TAB 2: SESSION 2 - HANDS-ON DEVELOPER =====================
            _build_session2_tab()
            # ===================== TAB 3: RESOURCES =====================
            _build_resources_tab()

    return app


# Built at import time so the hosting runtime can pick up `app` directly.
app = create_interface()


# ===================== MAIN =====================

if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
    )