Spaces:
Running
Running
| import unicodedata | |
| import json | |
| import requests | |
| import re | |
| from bs4 import BeautifulSoup | |
| from typing import Union, List, Dict, Optional | |
def clean_text(text: str) -> str:
    """Normalize Unicode punctuation, strip HTML remnants, and collapse whitespace.

    Args:
        text: Raw text, possibly containing typographic punctuation, HTML
            tags/entities, and control characters.

    Returns:
        The cleaned text; the original value is returned unchanged when the
        input is falsy or an unexpected error occurs mid-cleaning.
    """
    if not text:
        return text
    # Map typographic quotes/dashes/ellipses (and combining acute accent)
    # to plain ASCII equivalents.  Escape sequences are used so the source
    # stays unambiguous regardless of file encoding.
    replacements = {
        '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-', '\u2026': '...',
        '\xa0': ' ', '\u02bc': "'", '\u02b9': "'",
        '\u0301': '', '\u0060': "'", '\u00b4': "'",
    }
    try:
        # NFKD decomposition first, so composed characters expose their
        # combining marks (e.g. the \u0301 accent) for the table above.
        text = unicodedata.normalize('NFKD', text)
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Turn paragraph boundaries into spaces, then drop any remaining tags.
        text = re.sub(r'</p>\s*<p>', ' ', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', text)
        # Decode the handful of HTML entities that commonly survive extraction.
        entities = {
            '&nbsp;': ' ', '&quot;': '"', '&amp;': '&',
            '&lt;': '<', '&gt;': '>', '&#39;': "'",
        }
        for ent, rep in entities.items():
            text = text.replace(ent, rep)
        # Collapse all runs of whitespace, then strip control characters
        # (Unicode category "C*").
        text = ' '.join(text.split())
        text = ''.join(char for char in text
                       if not unicodedata.category(char).startswith('C'))
        return text
    except Exception as e:
        # Best-effort cleaner: report and fall back to the input as-is.
        print(f"Error in clean_text: {str(e)}")
        return text
def extract_court_decision_text(url: str) -> str:
    """Extract text from court decision URL - specific to reyestr.court.gov.ua.

    Tries three strategies in order: the registry's ``txtdepository``
    textarea, plain ``<p>`` tags, then a ``wordwrap`` div.  Raises when the
    page cannot be fetched or yields fewer than 100 characters of text.
    """
    try:
        # Add headers and timeout for better reliability
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Message is Ukrainian ("Error loading URL") for end users.
        raise Exception(f"Помилка при завантаженні URL: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Registry banner texts to filter out (test-mode notice and abuse warning).
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]
    result = ""
    # Strategy 1: Look for textarea with id="txtdepository" (reyestr.court.gov.ua specific)
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea contains HTML content as text
        embedded_html = txtdepository.get_text()
        # Parse the embedded HTML
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')
        # Extract text from paragraphs
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Replace non-breaking spaces with plain spaces
            p_text = p_text.replace('\xa0', ' ').replace('  ', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)
    # Strategy 2: Try to find paragraphs directly (fallback)
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()
    # Strategy 3: If still nothing, try wordwrap div
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()
    # Clean up the result: drop blank/very short lines and banner remnants
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)
    print(f"[DEBUG] Extracted {len(result)} characters from URL")
    if not result or len(result) < 100:
        # Ukrainian: "Could not extract decision text from URL. The page may
        # use JavaScript or its structure may have changed."
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")
    return result
def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Parse document IDs from various input formats.

    Accepts a list of IDs, a comma-separated string (optionally wrapped in
    brackets, e.g. ``"[123, 456]"``), or ``None``.

    Returns:
        A list of ID strings; empty when nothing parseable is supplied.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Stringify each element; trim stray bracket characters. Loop variable
        # renamed from `id` to avoid shadowing the builtin.
        return [str(doc_id).strip('[]') for doc_id in doc_ids]
    if isinstance(doc_ids, str):
        # Normalize "[1, 2]" / "1, 2" forms to "1,2" before splitting.
        cleaned = doc_ids.strip('[]').replace(' ', '')
        if cleaned:
            return [part.strip() for part in cleaned.split(',')]
    return []
def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Build a comma-separated string of markdown links for document IDs."""
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )
def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Parse a legal-position ID (string or int) into a list of ID strings."""
    if lp_ids is None:
        return []
    if not isinstance(lp_ids, (str, int)):
        return []
    # Trim stray brackets/spaces; an empty remainder yields no IDs.
    normalized = str(lp_ids).strip('[]').replace(' ', '')
    return [normalized] if normalized else []
def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Build a comma-separated string of markdown links for legal-position IDs."""
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )
def extract_json_from_text(text: str) -> Optional[Dict]:
    """Extract and parse JSON from text, handling markdown blocks and other noise.

    Tries, in order: the whole payload as JSON, the contents of a fenced
    code block, and finally a brace-delimited substring scan.  Returns the
    parsed object or ``None`` when no valid JSON can be found.
    """
    if not text:
        return None
    stripped = text.strip()
    # 1. The whole payload may already be valid JSON.
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass
    # 2. Peel the first fenced code block (```json / '''json / bare fences).
    candidate_text = stripped
    for fence in ("```json", "'''json", "```", "'''"):
        if fence not in candidate_text:
            continue
        try:
            segments = candidate_text.split(fence)
            if len(segments) > 1:
                # The closing fence never carries the "json" tag.
                closing = fence.replace("json", "")
                inner = segments[1].split(closing)[0].strip()
                if inner:
                    candidate_text = inner
                    break
        except Exception:
            continue
    try:
        return json.loads(candidate_text)
    except json.JSONDecodeError:
        pass
    # 3. Last resort: from the first '{', try every trailing '}' from the
    #    end inward, which tolerates trailing noise after the object.
    open_idx = candidate_text.find('{')
    if open_idx != -1:
        for close_idx in range(len(candidate_text) - 1, open_idx, -1):
            if candidate_text[close_idx] != '}':
                continue
            try:
                return json.loads(candidate_text[open_idx:close_idx + 1])
            except json.JSONDecodeError:
                continue
    return None