Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed on Mar 5

Commit

9d29356

verified ·

1 Parent(s): d1c2005

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +66 -12

working_yolo_pipeline.py CHANGED Viewed

@@ -2799,7 +2799,6 @@ import glob
 # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
 #     if not os.path.exists(input_pdf_path):
 #         print(f"❌ ERROR: File not found: {input_pdf_path}")
@@ -2834,12 +2833,25 @@ import glob
 #             return None
 #         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
-#         # --- Phase 2: Inference ---
-#         print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
 #         p2_start = time.time()
 #         page_raw_predictions_list = run_inference_and_get_raw_words(
-#             input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
 #         )
 #         if not page_raw_predictions_list:
 #             print("❌ FAILED at Step 2: Inference returned no data.")
 #             return None
@@ -2923,10 +2935,11 @@ import glob
-def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
     if not os.path.exists(input_pdf_path):
         print(f"❌ ERROR: File not found: {input_pdf_path}")
-        return None
     print("\n" + "#" * 80)
     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
@@ -2954,9 +2967,40 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
         if not preprocessed_json_path_out:
             print("❌ FAILED at Step 1: Preprocessing returned None.")
-            return None
         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
         # --- Phase 2: Inference (MODIFIED) ---
         print(f"\n[Step 2/5] Inference (Custom Model)...")
         p2_start = time.time()
@@ -2978,7 +3022,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         if not page_raw_predictions_list:
             print("❌ FAILED at Step 2: Inference returned no data.")
-            return None
         with open(raw_output_path, 'w', encoding='utf-8') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
@@ -2992,7 +3037,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         )
         if not structured_data_list:
             print("❌ FAILED at Step 3: BIO conversion failed.")
-            return None
         print("... Correcting misalignments and linking context ...")
         structured_data_list = correct_misaligned_options(structured_data_list)
@@ -3005,7 +3051,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
         if not final_result:
             print("❌ FAILED at Step 4: Final formatting failed.")
-            return None
         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
         # --- Phase 4.5: Question Type Classification ---
@@ -3038,7 +3085,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
         print(f"Error Message: {str(e)}")
         traceback.print_exc()
-        return None
     # finally:
     #     print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
@@ -3055,7 +3103,13 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
     print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
     print("#" * 80)
-    return final_result

 # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
 #     if not os.path.exists(input_pdf_path):
 #         print(f"❌ ERROR: File not found: {input_pdf_path}")
 #             return None
 #         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
+#         # --- Phase 2: Inference (MODIFIED) ---
+#         print(f"\n[Step 2/5] Inference (Custom Model)...")
 #         p2_start = time.time()
+#         # -------------------------------------------------------------------------
+#         # --- COMMENTED OUT OLD LAYOUTLMV3 CALL FOR REVERSION ---
 #         page_raw_predictions_list = run_inference_and_get_raw_words(
+#            input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
 #         )
+#         # -------------------------------------------------------------------------
+#         # --- NEW CUSTOM MODEL CALL ---
+#         # Note: We only pass the JSON path because the custom function
+#         # doesn't need to re-read the PDF or use the layoutlmv3 model path.
+#         # page_raw_predictions_list = run_custom_inference_and_get_raw_words(
+#         #     preprocessed_json_path_out
+#         # )
+#         # -----------------------------
 #         if not page_raw_predictions_list:
 #             print("❌ FAILED at Step 2: Inference returned no data.")
 #             return None
+def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None):
     if not os.path.exists(input_pdf_path):
         print(f"❌ ERROR: File not found: {input_pdf_path}")
+        yield {"status": "error", "message": f"File not found: {input_pdf_path}"}
+        return
     print("\n" + "#" * 80)
     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
         if not preprocessed_json_path_out:
             print("❌ FAILED at Step 1: Preprocessing returned None.")
+            yield {"status": "error", "message": "FAILED at Step 1: Preprocessing returned None."}
+            return
         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
+        # ============================================================
+        # ⏱️ YIELD ESTIMATION DATA IMMEDIATELY AFTER PHASE 1
+        # ============================================================
+        try:
+            import fitz
+            doc = fitz.open(input_pdf_path)
+            page_count = doc.page_count
+            doc.close()
+        except Exception as e:
+            print(f"⚠️ Could not get page count for estimation: {e}")
+            page_count = 0
+        global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+        # Safely get the globals in case they weren't initialized
+        fig_count = globals().get('GLOBAL_FIGURE_COUNT', 0)
+        eq_count = globals().get('GLOBAL_EQUATION_COUNT', 0)
+        # Adjust these multipliers based on your pipeline's actual processing speed
+        est_time_seconds = (page_count * 5) + (fig_count * 2) + (eq_count * 1)
+        yield {
+            "status": "estimating",
+            "page_count": page_count,
+            "figure_count": fig_count,
+            "equation_count": eq_count,
+            "estimated_time_seconds": est_time_seconds
+        }
+        # ============================================================
         # --- Phase 2: Inference (MODIFIED) ---
         print(f"\n[Step 2/5] Inference (Custom Model)...")
         p2_start = time.time()
         if not page_raw_predictions_list:
             print("❌ FAILED at Step 2: Inference returned no data.")
+            yield {"status": "error", "message": "FAILED at Step 2: Inference returned no data."}
+            return
         with open(raw_output_path, 'w', encoding='utf-8') as f:
             json.dump(page_raw_predictions_list, f, indent=4)
         )
         if not structured_data_list:
             print("❌ FAILED at Step 3: BIO conversion failed.")
+            yield {"status": "error", "message": "FAILED at Step 3: BIO conversion failed."}
+            return
         print("... Correcting misalignments and linking context ...")
         structured_data_list = correct_misaligned_options(structured_data_list)
         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
         if not final_result:
             print("❌ FAILED at Step 4: Final formatting failed.")
+            yield {"status": "error", "message": "FAILED at Step 4: Final formatting failed."}
+            return
         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
         # --- Phase 4.5: Question Type Classification ---
         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
         print(f"Error Message: {str(e)}")
         traceback.print_exc()
+        yield {"status": "error", "message": str(e)}
+        return
     # finally:
     #     print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
     print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
     print("#" * 80)
+    # ============================================================
+    # 🏁 YIELD FINAL RESULT
+    # ============================================================
+    yield {
+        "status": "complete",
+        "result": final_result
+    }