Update app.py
app.py CHANGED
@@ -16,9 +16,6 @@ import re
 
 import logging
 
-from pydantic import BaseModel, Field, ValidationError, RootModel
-from typing import List, Optional
-
 
 HF_API_KEY = os.getenv("HF_API_KEY")
 
@@ -250,28 +247,7 @@ def process_long_table(rows):
 
         table_data.append(row_data)
 
-
-    filtered_table_data = []
-    for row in table_data:
-        # Check potential serial number columns (use both Chinese and English variants)
-        serial_number = None
-        for column in row:
-            if any(term in column for term in ["序号"]):
-                serial_number = row[column]
-                break
-
-        # If we found a serial number column, check if its value is numeric
-        if serial_number is not None:
-            # Strip any non-numeric characters and check if there's still a value
-            # This keeps values like "1", "2." etc. but filters out "No." or other text
-            cleaned_number = re.sub(r'[^\d]', '', serial_number)
-            if cleaned_number:  # If there are any digits left, keep the row
-                filtered_table_data.append(row)
-        else:
-            # If we couldn't find a serial number column, keep the row
-            filtered_table_data.append(row)
-
-    return filtered_table_data
+    return table_data
 
 def extract_tables(root):
     """Extracts tables from the DOCX document and returns structured data."""
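For reference, the dropped filter decided row retention with a single digit-stripping regex on the 序号 (serial number) column; after this hunk, process_long_table returns every parsed row and any header/footer filtering has to happen downstream. A minimal standalone sketch of the old check (sample values are hypothetical):

import re

for value in ["1", "2.", "No.", "序号"]:
    cleaned = re.sub(r"[^\d]", "", value)  # strip everything but digits
    print(f"{value!r} -> {'keep' if cleaned else 'drop'}")
# '1' -> keep, '2.' -> keep, 'No.' -> drop, '序号' -> drop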
@@ -426,6 +402,29 @@ Contract data in JSON format:""" + f"""
         temperature=0.5,
     )
 
+    # Deepseek V3 --------------------------------
+    # client = OpenAI(
+    #     base_url="https://router.huggingface.co/novita",
+    #     api_key=HF_API_KEY,
+    # )
+
+    # completion = client.chat.completions.create(
+    #     model="deepseek/deepseek_v3",
+    #     messages=messages,
+    #     temperature=0.1,
+    # )
+
+    # Qwen 2.5 7B --------------------------------
+    # client = OpenAI(
+    #     base_url="https://router.huggingface.co/together",
+    #     api_key=HF_API_KEY,
+    # )
+
+    # completion = client.chat.completions.create(
+    #     model="Qwen/Qwen2.5-7B-Instruct-Turbo",
+    #     messages=messages,
+    # )
+
     think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
     if think_text:
         print(f"Thought Process: {think_text}")
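The commented-out blocks above keep two alternate OpenAI-compatible backends one uncomment away: Deepseek V3 through the novita router and Qwen 2.5 7B through together. Since only base_url and the model id change, the swap can also be expressed as data; a hypothetical helper (BACKENDS and make_client are illustrative, not part of app.py; only the URLs and model IDs come from the diff):

from openai import OpenAI

# Hypothetical registry mapping a backend name to (base_url, model id).
BACKENDS = {
    "deepseek-v3": ("https://router.huggingface.co/novita", "deepseek/deepseek_v3"),
    "qwen-2.5-7b": ("https://router.huggingface.co/together", "Qwen/Qwen2.5-7B-Instruct-Turbo"),
}

def make_client(backend: str, api_key: str):
    base_url, model = BACKENDS[backend]
    return OpenAI(base_url=base_url, api_key=api_key), model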
@@ -442,110 +441,50 @@ Contract data in JSON format:""" + f"""
     return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 
 
-def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
-    """..."""
-
-    # Pydantic schema
-    class PriceItem(BaseModel):
-        序号: str
-        名称: str
-        名称_英文: str = Field(..., alias="名称(英文)")
-        品牌: str
-        规格: str
-        所属机型: str
-        采购数量: str
-        单位: str
-        单价: str
-        总价: str
-        几郎单价: str
-        几郎总额: str
-        备注: str
-        计划来源: str
-        其他: dict = Field(default_factory=dict, alias="其他")
-
-    class PriceListModel(BaseModel):
-        items: List[PriceItem]
-
-    base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
-有时候第一行是表头,有时候是数据行,只输入数据行。请注意,输出的 JSON 需要符合以下格式要求:
-
-# 输出格式要求:
-每个条目输出以下字段:
-- 序号
-- 名称:只填中文
-- 名称(英文):只填英文
-- 品牌
-- 规格
-- 所属机型
-- 采购数量
-- 单位
-- 单价: 只填数字
-- 总价: 只填数字
-- 几郎单价: 只填数字
-- 几郎总额: 只填数字
-- 备注
-- 计划来源
-- 其他:如果有以上以外的字段就以list的形式写在其他里 ("其他": "key1": "value1", "key2":"value2"),如果没有就给一个空的list
-
-请确保输出的 JSON 是有效的,且字段名称与输入的字段名称一致。请注意,字段名称可能会有不同的拼写方式,请根据上下文进行判断。
-请确保输出的条目数量与输入的列表数量一致。
-
-# 原始价格表:
-{price_list}"""
-
-    messages = [{"role": "user", "content": base_prompt}]
-
-    client = OpenAI(
-        base_url="https://router.huggingface.co/novita",
-        api_key=HF_API_KEY,
-    )
-
-    for attempt in range(3):
-        print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
-
-        try:
-            response = client.chat.completions.create(
-                model="deepseek/deepseek-r1-distill-qwen-14b",
-                messages=messages,
-            )
-            raw = response.choices[0].message.content
-
-            # Strip out LLM artifacts
-            raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
-            raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
-
-            raw = '{"items": ' + raw + '}'
-
-            price_list_json = PriceListModel.model_validate_json(raw).model_dump(by_alias=True)
-
-            if save_json:
-                with open(json_name, "w", encoding="utf-8") as f:
-                    json.dump(price_list_json, f, ensure_ascii=False, indent=4)
-                print(f"✅ Saved to {json_name}")
-
-            return price_list_json
-
-        except Exception as e:
-            error_msg = f"Unexpected error: {e}"
-            messages.append({
-                "role": "user",
-                "content": error_msg,
-            })
+def deepseek_extract_price_list(json_data):
+    """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
+
+    # Step 1: Convert JSON string to Python dictionary
+    contract_data = json.loads(json_data)
+
+    # Step 2: Keep only keys that contain "long_table"
+    filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" in key}
+
+    # Step 3: Convert back to JSON string (if needed)
+    json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
+
+    prompt = """You are given a price list in JSON format. Extract the following information in CSV format:
+
+# Response Format
+Return the extracted information as a CSV in the exact format shown below:
+
+物料名称, 物料名称(英文), 物料规格, 采购数量, 单位, 单价, 计划号
+
+JSON data:""" + f"""
+{json_output}"""
+
+    messages = [
+        {
+            "role": "user",
+            "content": prompt
+        }
+    ]
+
+    client = OpenAI(
+        base_url="https://router.huggingface.co/novita",
+        api_key=HF_API_KEY,
+    )
+
+    completion = client.chat.completions.create(
+        model="deepseek/deepseek-r1-distill-qwen-14b",
+        messages=messages,
+    )
+
+    price_list = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL)
+    price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)
+
+    return price_list
+
 def json_to_excel(contract_summary, json_data, excel_path):
     """Converts extracted JSON tables to an Excel file."""
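The removed implementation validated the model reply against a Pydantic schema and retried on failure; the replacement simply strips <think> blocks and code fences and returns the CSV text. A condensed, runnable sketch of the removed validation step, with the schema trimmed to two fields and a made-up reply (the regexes and the '{"items": ...}' wrapping are as in the removed lines):

import re
from typing import List
from pydantic import BaseModel, ValidationError

class PriceItem(BaseModel):
    序号: str
    名称: str

class PriceListModel(BaseModel):
    items: List[PriceItem]

raw = '<think>model reasoning</think>\n```json\n[{"序号": "1", "名称": "PE波纹管"}]\n```'
raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)    # drop the reasoning block
raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)  # drop the code fences
raw = '{"items": ' + raw + '}'  # wrap the bare list so it matches the schema

try:
    validated = PriceListModel.model_validate_json(raw)  # Pydantic v2 API
    print(validated.items[0].名称)  # PE波纹管
except ValidationError as exc:
    print(f"Validation failed: {exc}")  # the removed loop re-prompted here, up to 3 attempts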
@@ -568,7 +507,7 @@ def json_to_excel(contract_summary, json_data, excel_path):
 #--- Extract PO ------------------------------
 
 def extract_po(docx_path):
-    """Processes a single .docx file, extracts tables, formats with OpenAI, and
+    """Processes a single .docx file, extracts tables, formats with OpenAI, and saves as an Excel file."""
     if not os.path.exists(docx_path) or not docx_path.endswith(".docx"):
         raise ValueError(f"Invalid file: {docx_path}")
 
@@ -579,42 +518,28 @@ def extract_po(docx_path):
     # Step 1: Extract XML content from DOCX
     print("Extracting Docs data to XML...")
     xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
-    xml_file = extract_docx_as_xml(docx_bytes, save_xml=
+    xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
 
     get_namespace(ET.fromstring(xml_file))
 
     # Step 2: Extract tables from DOCX and save JSON
     print("Extracting XML data to JSON...")
     json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
-    extracted_data = xml_to_json(xml_file, save_json=
+    extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
 
-    # Step
-    print("Processing
+    # Step 3: Process JSON with OpenAI to get structured output
+    print("Processing JSON data with AI...")
     contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
-    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=
-
-    # Find the last long table (excluding summary tables)
-    print("Processing Price List data with AI...")
-    long_tables = [
-        table for key, table in json.loads(extracted_data).items()
-        if "long_table" in key and "summary" not in key
-    ]
-    last_long_table = long_tables[-1] if long_tables else {}
-
-    # Generate the price list filename in the same folder as the document
-    price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
-
-    # Process the price list and save it to a JSON file
-    price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
+    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
 
-    # Step
-    print("
-
-    return {
-        "contract_summary": json.loads(json.loads(contract_summary)),
-        "price_list": price_list
-    }
+    # Step 4: Save formatted data as Excel
+    print("Converting AI Generated JSON to Excel...")
+    excel_output_path = os.path.splitext(docx_path)[0] + ".xlsx"
+    json_to_excel(contract_summary, extracted_data, excel_output_path)
 
+    print(f"Excel file saved at: {excel_output_path}")
+
+
     # Logging
     log = f"""Results:
 
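Every intermediate filename in this hunk is derived from the input path with os.path.splitext/basename. A small sketch of the naming convention (the input path is hypothetical):

import os

docx_path = "contracts/test-contract.docx"  # hypothetical input
base = os.path.splitext(os.path.basename(docx_path))[0]
print(base + "_document.xml")                    # test-contract_document.xml
print(base + "_extracted_data.json")             # test-contract_extracted_data.json
print(base + "_contract_summary.json")           # test-contract_contract_summary.json
print(os.path.splitext(docx_path)[0] + ".xlsx")  # contracts/test-contract.xlsx

Note the asymmetry: the XML/JSON names drop the directory and land in the working directory, while the Excel file keeps the full path and is written next to the source document.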
@@ -622,20 +547,20 @@ def extract_po(docx_path):
 
     RAW Extracted Data: {extracted_data},
 
-
+    XML Preview: {xml_file[:1000]}"""
 
     print(log)
+
     logging.info(f"""{log}""")
 
-
+
+    return excel_output_path
 
 # Example Usage
 
 # extract_po("test-contract-converted.docx")
 # extract_po("test-contract.docx")
 
-# print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
-
 # Gradio Interface ------------------------------
 
 import gradio as gr
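Since extract_po now returns excel_output_path, the whole pipeline can be smoke-tested in one call before wiring it to Gradio; a hypothetical invocation mirroring the commented examples, assuming the sample file exists and HF_API_KEY is set:

xlsx_path = extract_po("test-contract.docx")
print(xlsx_path)  # test-contract.xlsx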
@@ -645,10 +570,9 @@ interface = gr.Interface(
     fn=extract_po,
     title="PO Extractor 买卖合同数据提取",
     inputs=gr.File(label="买卖合同 (.docx)"),
-    outputs=gr.
+    outputs=gr.File(label="数据提取结果 (.xlsx)"),
     flagging_mode="never",
     theme=Base()
 )
 
 interface.launch()
-