IJ-Reynolds HF Staff committed on
Commit
f3975b8
·
verified ·
1 Parent(s): 7effdd7

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +33 -20
main.py CHANGED
@@ -159,31 +159,44 @@ def fetch_congress_scraped():
159
  if len(title) < 15 or not is_relevant(title): continue
160
  seen_links.add(full_url)
161
 
162
- # --- AGGRESSIVE DATE HUNTING ---
163
  fmt_date = None
164
- current_node = a_tag
165
 
166
- # Climb up the DOM tree up to 5 levels to find the date stamp
167
- for _ in range(5):
168
- if current_node.parent:
169
- current_node = current_node.parent
170
- node_text = current_node.get_text(" ", strip=True)
171
- found_date = extract_robust_date([node_text])
172
- if found_date:
173
- fmt_date = found_date
174
- break
175
-
176
- # If still no date, check previous text nodes entirely
177
  if not fmt_date:
178
- prev_text = a_tag.find_previous(string=True)
179
- fmt_date = extract_robust_date([prev_text]) if prev_text else None
180
-
181
- # Only fallback to today if absolutely completely missing
182
- fmt_date = fmt_date or datetime.now()
183
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  results.append({
185
  "source": name, "type": "Legislative Office Press Release",
186
- "event_date": fmt_date, "time": "Published", "title": title,
 
187
  "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
188
  })
189
  time.sleep(1)
 
159
  if len(title) < 15 or not is_relevant(title): continue
160
  seen_links.add(full_url)
161
 
162
+ # --- UPGRADED DATE HUNTING (Container Search) ---
163
  fmt_date = None
 
164
 
165
+ # 1. Look for the entire row/article container (Catches sibling dates in Drupal!)
166
+ container = a_tag.find_parent(["article", "tr", "li"])
167
+ if not container:
168
+ container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post", re.I))
169
+
170
+ if container:
171
+ fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
172
+
173
+ # 2. Fallback DOM climb
 
 
174
  if not fmt_date:
175
+ current_node = a_tag
176
+ for _ in range(5):
177
+ if current_node.parent:
178
+ current_node = current_node.parent
179
+ found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
180
+ if found_date:
181
+ fmt_date = found_date
182
+ break
183
+
184
+ # --- THE USER-FACING MISSING DATE FLAG ---
185
+ if not fmt_date:
186
+ # We stop defaulting to today! Mark it explicitly for the user.
187
+ display_time = "⚠️ DATE UNKNOWN"
188
+ display_title = f"[DATE MISSING] {title}"
189
+ else:
190
+ # If we found a date, run the age gate
191
+ days_old = (datetime.now() - fmt_date).days
192
+ if days_old > 60: continue
193
+ display_time = "Published"
194
+ display_title = title
195
+
196
  results.append({
197
  "source": name, "type": "Legislative Office Press Release",
198
+ "event_date": fmt_date, # This will be passed as None (Blank) instead of today
199
+ "time": display_time, "title": display_title,
200
  "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
201
  })
202
  time.sleep(1)