Spaces:
Running
Running
| import unicodedata | |
| import json | |
| import requests | |
| import re | |
| from bs4 import BeautifulSoup | |
| from typing import Union, List, Dict, Optional | |
def clean_text(text: str) -> str:
    """Normalize Unicode punctuation, strip HTML remnants, and collapse whitespace.

    Args:
        text: Raw text, possibly containing typographic punctuation, HTML
            tags/entities, and control characters.

    Returns:
        The cleaned text; the original value is returned unchanged when the
        input is falsy or an unexpected error occurs mid-cleaning.
    """
    if not text:
        return text
    # Map typographic quotes/dashes/ellipses (and combining acute accent)
    # to plain ASCII equivalents.  Escape sequences are used so the source
    # stays unambiguous regardless of file encoding.
    replacements = {
        '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-', '\u2026': '...',
        '\xa0': ' ', '\u02bc': "'", '\u02b9': "'",
        '\u0301': '', '\u0060': "'", '\u00b4': "'",
    }
    try:
        # NFKD decomposition first, so composed characters expose their
        # combining marks (e.g. the \u0301 accent) for the table above.
        text = unicodedata.normalize('NFKD', text)
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Turn paragraph boundaries into spaces, then drop any remaining tags.
        text = re.sub(r'</p>\s*<p>', ' ', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', text)
        # Decode the handful of HTML entities that commonly survive extraction.
        entities = {
            '&nbsp;': ' ', '&quot;': '"', '&amp;': '&',
            '&lt;': '<', '&gt;': '>', '&#39;': "'",
        }
        for ent, rep in entities.items():
            text = text.replace(ent, rep)
        # Collapse all runs of whitespace, then strip control characters
        # (Unicode category "C*").
        text = ' '.join(text.split())
        text = ''.join(char for char in text
                       if not unicodedata.category(char).startswith('C'))
        return text
    except Exception as e:
        # Best-effort cleaner: report and fall back to the input as-is.
        print(f"Error in clean_text: {str(e)}")
        return text
def extract_court_decision_text(url: str) -> str:
    """Extract text from court decision URL - specific to reyestr.court.gov.ua.

    Tries three strategies in order: the registry's ``txtdepository``
    textarea, plain ``<p>`` tags, then a ``wordwrap`` div.  Raises when the
    page cannot be fetched or yields fewer than 100 characters of text.
    """
    try:
        # Add headers and timeout for better reliability
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Message is Ukrainian ("Error loading URL") for end users.
        raise Exception(f"Помилка при завантаженні URL: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Registry banner texts to filter out (test-mode notice and abuse warning).
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]
    result = ""
    # Strategy 1: Look for textarea with id="txtdepository" (reyestr.court.gov.ua specific)
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea contains HTML content as text
        embedded_html = txtdepository.get_text()
        # Parse the embedded HTML
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')
        # Extract text from paragraphs
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Replace non-breaking spaces with plain spaces
            p_text = p_text.replace('\xa0', ' ').replace('  ', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)
    # Strategy 2: Try to find paragraphs directly (fallback)
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()
    # Strategy 3: If still nothing, try wordwrap div
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()
    # Clean up the result: drop blank/very short lines and banner remnants
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)
    print(f"[DEBUG] Extracted {len(result)} characters from URL")
    if not result or len(result) < 100:
        # Ukrainian: "Could not extract decision text from URL. The page may
        # use JavaScript or its structure may have changed."
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")
    return result
def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Parse document IDs from various input formats.

    Accepts a list of IDs, a comma-separated string (optionally wrapped in
    brackets, e.g. ``"[123, 456]"``), or ``None``.

    Returns:
        A list of ID strings; empty when nothing parseable is supplied.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Stringify each element; trim stray bracket characters. Loop variable
        # renamed from `id` to avoid shadowing the builtin.
        return [str(doc_id).strip('[]') for doc_id in doc_ids]
    if isinstance(doc_ids, str):
        # Normalize "[1, 2]" / "1, 2" forms to "1,2" before splitting.
        cleaned = doc_ids.strip('[]').replace(' ', '')
        if cleaned:
            return [part.strip() for part in cleaned.split(',')]
    return []
def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Build a comma-separated string of markdown links for document IDs."""
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )
def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Parse a legal-position ID (string or int) into a list of ID strings."""
    if lp_ids is None:
        return []
    if not isinstance(lp_ids, (str, int)):
        return []
    # Trim stray brackets/spaces; an empty remainder yields no IDs.
    normalized = str(lp_ids).strip('[]').replace(' ', '')
    return [normalized] if normalized else []
def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Build a comma-separated string of markdown links for legal-position IDs."""
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )
def extract_json_from_text(text: str) -> Optional[Dict]:
    """Extract and parse JSON from text, handling markdown blocks and other noise.

    Tries, in order: the whole payload as JSON, the contents of a fenced
    code block, and finally a brace-delimited substring scan.  Returns the
    parsed object or ``None`` when no valid JSON can be found.
    """
    if not text:
        return None
    stripped = text.strip()
    # 1. The whole payload may already be valid JSON.
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass
    # 2. Peel the first fenced code block (```json / '''json / bare fences).
    candidate_text = stripped
    for fence in ("```json", "'''json", "```", "'''"):
        if fence not in candidate_text:
            continue
        try:
            segments = candidate_text.split(fence)
            if len(segments) > 1:
                # The closing fence never carries the "json" tag.
                closing = fence.replace("json", "")
                inner = segments[1].split(closing)[0].strip()
                if inner:
                    candidate_text = inner
                    break
        except Exception:
            continue
    try:
        return json.loads(candidate_text)
    except json.JSONDecodeError:
        pass
    # 3. Last resort: from the first '{', try every trailing '}' from the
    #    end inward, which tolerates trailing noise after the object.
    open_idx = candidate_text.find('{')
    if open_idx != -1:
        for close_idx in range(len(candidate_text) - 1, open_idx, -1):
            if candidate_text[close_idx] != '}':
                continue
            try:
                return json.loads(candidate_text[open_idx:close_idx + 1])
            except json.JSONDecodeError:
                continue
    return None