Removed the hashtags
Browse files
app.py
CHANGED
|
@@ -66,14 +66,27 @@ class ArticleScraperState(TypedDict):
|
|
| 66 |
|
| 67 |
# Helper function to detect English language
|
| 68 |
def is_english(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
try:
|
|
|
|
| 70 |
return detect(text) == 'en'
|
| 71 |
except:
|
| 72 |
-
# If detection fails,
|
| 73 |
-
common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that'
|
|
|
|
| 74 |
text_lower = text.lower()
|
|
|
|
| 75 |
english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# News search functions
|
| 79 |
def search_ai_news(state: NewsState):
|
|
@@ -362,9 +375,8 @@ def llm_call(state: WorkerState):
|
|
| 362 |
|
| 363 |
section = state['section']
|
| 364 |
|
| 365 |
-
# Generate section header
|
| 366 |
-
|
| 367 |
-
section_header = f"## {section.name} {{#{section_id}}}\n\n{section.description}\n"
|
| 368 |
|
| 369 |
# If there are subsections, process each one
|
| 370 |
subsections_content = ""
|
|
@@ -388,11 +400,8 @@ Keep your response focused on the news item and make it engaging. Use markdown f
|
|
| 388 |
HumanMessage(content=subsection_prompt)
|
| 389 |
])
|
| 390 |
|
| 391 |
-
#
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
# Format subsection with title and source
|
| 395 |
-
formatted_subsection = f"### {subsection.title} {{#{subsection_id}}}\n\n"
|
| 396 |
formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
|
| 397 |
formatted_subsection += subsection_content.content
|
| 398 |
|
|
@@ -434,14 +443,15 @@ def synthesizer(state: BlogState):
|
|
| 434 |
table_of_contents = "## Table of Contents\n\n"
|
| 435 |
|
| 436 |
# Find all section headings (## headings)
|
| 437 |
-
section_matches = re.findall(r'## (
|
| 438 |
|
| 439 |
-
for i,
|
| 440 |
-
# Add section to TOC
|
| 441 |
-
|
|
|
|
|
|
|
| 442 |
|
| 443 |
# Find all subsections within this section
|
| 444 |
-
# Look for subsection headings (### headings) until the next section or end of text
|
| 445 |
section_start = completed_report.find(f"## {section_name}")
|
| 446 |
next_section_match = re.search(r'## ', completed_report[section_start+1:])
|
| 447 |
if next_section_match:
|
|
@@ -450,12 +460,14 @@ def synthesizer(state: BlogState):
|
|
| 450 |
else:
|
| 451 |
section_text = completed_report[section_start:]
|
| 452 |
|
| 453 |
-
# Extract subsection headings
|
| 454 |
-
subsection_matches = re.findall(r'### (
|
| 455 |
|
| 456 |
-
for j,
|
|
|
|
|
|
|
| 457 |
# Add subsection to TOC with proper indentation
|
| 458 |
-
table_of_contents += f" {i}.{j}. [{subsection_name}](#{
|
| 459 |
|
| 460 |
final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
|
| 461 |
|
|
@@ -559,6 +571,10 @@ def generate_ai_news_blog(groq_api_key=None, tavily_api_key=None, date=None):
|
|
| 559 |
"content": result["article_content"]
|
| 560 |
})
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
# Format news content for the blog generator
|
| 563 |
formatted_content = "\n\n".join([
|
| 564 |
f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
|
|
@@ -599,15 +615,35 @@ def create_gradio_interface():
|
|
| 599 |
tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
|
| 600 |
date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
|
| 601 |
value=datetime.now().strftime("%Y-%m-%d"))
|
| 602 |
-
|
|
|
|
|
|
|
| 603 |
|
| 604 |
with gr.Column():
|
|
|
|
| 605 |
output_md = gr.Markdown("Your AI News Blog will appear here.")
|
| 606 |
|
|
|
|
| 607 |
generate_button.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
fn=run_generation,
|
| 609 |
inputs=[groq_key, tavily_key, date_picker],
|
| 610 |
outputs=output_md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
)
|
| 612 |
|
| 613 |
return demo
|
|
|
|
| 66 |
|
| 67 |
# Helper function to detect English language
def is_english(text):
    """Return True if *text* appears to be written in English.

    Tries the primary `detect` language detector first; if detection
    raises (short/ambiguous input, detector failure), falls back to a
    heuristic based on counting common English stop words.

    Args:
        text: Candidate text; may be None or empty.

    Returns:
        bool: True when the text is judged to be English.
    """
    # Ensure we have enough text to analyze
    if not text or len(text.strip()) < 50:
        return False

    try:
        # Try primary language detection
        return detect(text) == 'en'
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not silently swallowed; any detector failure still falls back.
    except Exception:
        # If detection fails, use a more robust heuristic approach
        common_english_words = ['the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
                                'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from']
        text_lower = text.lower()
        # Count occurrences of common English words (space-delimited whole words;
        # punctuation-adjacent words are intentionally not matched)
        english_word_count = sum(1 for word in common_english_words if f" {word} " in f" {text_lower} ")
        # Calculate ratio of English words to text length
        text_words = len(text_lower.split())
        if text_words == 0:  # Avoid division by zero
            return False
        english_ratio = english_word_count / min(20, text_words)  # Cap at 20 to avoid skew
        return english_word_count >= 5 or english_ratio > 0.25  # More stringent criteria
|
| 90 |
|
| 91 |
# News search functions
|
| 92 |
def search_ai_news(state: NewsState):
|
|
|
|
| 375 |
|
| 376 |
section = state['section']
|
| 377 |
|
| 378 |
+
# Generate section header without ID for cleaner markdown
|
| 379 |
+
section_header = f"## {section.name}\n\n{section.description}\n"
|
|
|
|
| 380 |
|
| 381 |
# If there are subsections, process each one
|
| 382 |
subsections_content = ""
|
|
|
|
| 400 |
HumanMessage(content=subsection_prompt)
|
| 401 |
])
|
| 402 |
|
| 403 |
+
# Format subsection with title and source (without ID tags)
|
| 404 |
+
formatted_subsection = f"### {subsection.title}\n\n"
|
|
|
|
|
|
|
|
|
|
| 405 |
formatted_subsection += f"*Source: [{subsection.source}]({subsection.url})*\n\n"
|
| 406 |
formatted_subsection += subsection_content.content
|
| 407 |
|
|
|
|
| 443 |
table_of_contents = "## Table of Contents\n\n"
|
| 444 |
|
| 445 |
# Find all section headings (## headings)
|
| 446 |
+
section_matches = re.findall(r'## ([^\n]+)', completed_report)
|
| 447 |
|
| 448 |
+
for i, section_name in enumerate(section_matches, 1):
|
| 449 |
+
# Add section to TOC with auto-generated link
|
| 450 |
+
# Create a clean anchor from the section name
|
| 451 |
+
section_anchor = section_name.lower().replace(' ', '-')
|
| 452 |
+
table_of_contents += f"{i}. [{section_name}](#{section_anchor})\n"
|
| 453 |
|
| 454 |
# Find all subsections within this section
|
|
|
|
| 455 |
section_start = completed_report.find(f"## {section_name}")
|
| 456 |
next_section_match = re.search(r'## ', completed_report[section_start+1:])
|
| 457 |
if next_section_match:
|
|
|
|
| 460 |
else:
|
| 461 |
section_text = completed_report[section_start:]
|
| 462 |
|
| 463 |
+
# Extract subsection headings
|
| 464 |
+
subsection_matches = re.findall(r'### ([^\n]+)', section_text)
|
| 465 |
|
| 466 |
+
for j, subsection_name in enumerate(subsection_matches, 1):
|
| 467 |
+
# Create a clean anchor from the subsection name
|
| 468 |
+
subsection_anchor = subsection_name.lower().replace(' ', '-').replace(':', '').replace('?', '').replace('!', '').replace('.', '')
|
| 469 |
# Add subsection to TOC with proper indentation
|
| 470 |
+
table_of_contents += f" {i}.{j}. [{subsection_name}](#{subsection_anchor})\n"
|
| 471 |
|
| 472 |
final_report = f"{blog_title}\n\n{intro.content}\n\n{table_of_contents}\n\n---\n\n{completed_report}\n\n---\n\n*This AI News Roundup was automatically generated on {today}.*"
|
| 473 |
|
|
|
|
| 571 |
"content": result["article_content"]
|
| 572 |
})
|
| 573 |
|
| 574 |
+
# Check if we have any news items
|
| 575 |
+
if not news_contents:
|
| 576 |
+
return "No English language AI news items found for the specified date. Please try a different date."
|
| 577 |
+
|
| 578 |
# Format news content for the blog generator
|
| 579 |
formatted_content = "\n\n".join([
|
| 580 |
f"TITLE: {item['title']}\nSOURCE: {item['source']}\nURL: {item['url']}\nDESCRIPTION: {item['description']}\nCONTENT: {item['content'][:2000]}..."
|
|
|
|
| 615 |
tavily_key = gr.Textbox(label="Tavily API Key", placeholder="Enter your Tavily API key", type="password")
|
| 616 |
date_picker = gr.Textbox(label="Date (YYYY-MM-DD)", placeholder="Leave empty for today's date",
|
| 617 |
value=datetime.now().strftime("%Y-%m-%d"))
|
| 618 |
+
with gr.Row():
|
| 619 |
+
generate_button = gr.Button("Generate AI News Blog", variant="primary")
|
| 620 |
+
clear_button = gr.Button("Clear Output")
|
| 621 |
|
| 622 |
with gr.Column():
|
| 623 |
+
status_text = gr.Textbox(label="Status", placeholder="Ready to generate", interactive=False)
|
| 624 |
output_md = gr.Markdown("Your AI News Blog will appear here.")
|
| 625 |
|
| 626 |
+
# Add loading state and status updates
|
| 627 |
generate_button.click(
|
| 628 |
+
fn=lambda: "Generating AI News Blog... This may take several minutes.",
|
| 629 |
+
inputs=None,
|
| 630 |
+
outputs=status_text,
|
| 631 |
+
queue=False
|
| 632 |
+
).then(
|
| 633 |
fn=run_generation,
|
| 634 |
inputs=[groq_key, tavily_key, date_picker],
|
| 635 |
outputs=output_md
|
| 636 |
+
).then(
|
| 637 |
+
fn=lambda: "Blog generation complete!",
|
| 638 |
+
inputs=None,
|
| 639 |
+
outputs=status_text
|
| 640 |
+
)
|
| 641 |
+
|
| 642 |
+
# Clear output
|
| 643 |
+
clear_button.click(
|
| 644 |
+
fn=lambda: ("Ready to generate", ""),
|
| 645 |
+
inputs=None,
|
| 646 |
+
outputs=[status_text, output_md]
|
| 647 |
)
|
| 648 |
|
| 649 |
return demo
|