heerjtdev commited on
Commit
9d29356
·
verified ·
1 Parent(s): d1c2005

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +66 -12
working_yolo_pipeline.py CHANGED
@@ -2799,7 +2799,6 @@ import glob
2799
 
2800
 
2801
 
2802
-
2803
  # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2804
  # if not os.path.exists(input_pdf_path):
2805
  # print(f"❌ ERROR: File not found: {input_pdf_path}")
@@ -2834,12 +2833,25 @@ import glob
2834
  # return None
2835
  # print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
2836
 
2837
- # # --- Phase 2: Inference ---
2838
- # print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
2839
  # p2_start = time.time()
 
 
 
2840
  # page_raw_predictions_list = run_inference_and_get_raw_words(
2841
- # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2842
  # )
 
 
 
 
 
 
 
 
 
 
2843
  # if not page_raw_predictions_list:
2844
  # print("❌ FAILED at Step 2: Inference returned no data.")
2845
  # return None
@@ -2923,10 +2935,11 @@ import glob
2923
 
2924
 
2925
 
2926
- def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2927
  if not os.path.exists(input_pdf_path):
2928
  print(f"❌ ERROR: File not found: {input_pdf_path}")
2929
- return None
 
2930
 
2931
  print("\n" + "#" * 80)
2932
  print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
@@ -2954,9 +2967,40 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2954
  preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2955
  if not preprocessed_json_path_out:
2956
  print("❌ FAILED at Step 1: Preprocessing returned None.")
2957
- return None
 
2958
  print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
2959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2960
  # --- Phase 2: Inference (MODIFIED) ---
2961
  print(f"\n[Step 2/5] Inference (Custom Model)...")
2962
  p2_start = time.time()
@@ -2978,7 +3022,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2978
 
2979
  if not page_raw_predictions_list:
2980
  print("❌ FAILED at Step 2: Inference returned no data.")
2981
- return None
 
2982
 
2983
  with open(raw_output_path, 'w', encoding='utf-8') as f:
2984
  json.dump(page_raw_predictions_list, f, indent=4)
@@ -2992,7 +3037,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
2992
  )
2993
  if not structured_data_list:
2994
  print("❌ FAILED at Step 3: BIO conversion failed.")
2995
- return None
 
2996
 
2997
  print("... Correcting misalignments and linking context ...")
2998
  structured_data_list = correct_misaligned_options(structured_data_list)
@@ -3005,7 +3051,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
3005
  final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
3006
  if not final_result:
3007
  print("❌ FAILED at Step 4: Final formatting failed.")
3008
- return None
 
3009
  print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
3010
 
3011
  # --- Phase 4.5: Question Type Classification ---
@@ -3038,7 +3085,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
3038
  print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
3039
  print(f"Error Message: {str(e)}")
3040
  traceback.print_exc()
3041
- return None
 
3042
 
3043
  # finally:
3044
  # print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
@@ -3055,7 +3103,13 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
3055
  print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
3056
  print("#" * 80)
3057
 
3058
- return final_result
 
 
 
 
 
 
3059
 
3060
 
3061
 
 
2799
 
2800
 
2801
 
 
2802
  # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2803
  # if not os.path.exists(input_pdf_path):
2804
  # print(f"❌ ERROR: File not found: {input_pdf_path}")
 
2833
  # return None
2834
  # print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
2835
 
2836
+ # # --- Phase 2: Inference (MODIFIED) ---
2837
+ # print(f"\n[Step 2/5] Inference (Custom Model)...")
2838
  # p2_start = time.time()
2839
+
2840
+ # # -------------------------------------------------------------------------
2841
+ # # --- COMMENTED OUT OLD LAYOUTLMV3 CALL FOR REVERSION ---
2842
  # page_raw_predictions_list = run_inference_and_get_raw_words(
2843
+ # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2844
  # )
2845
+ # # -------------------------------------------------------------------------
2846
+
2847
+ # # --- NEW CUSTOM MODEL CALL ---
2848
+ # # Note: We only pass the JSON path because the custom function
2849
+ # # doesn't need to re-read the PDF or use the layoutlmv3 model path.
2850
+ # # page_raw_predictions_list = run_custom_inference_and_get_raw_words(
2851
+ # # preprocessed_json_path_out
2852
+ # # )
2853
+ # # -----------------------------
2854
+
2855
  # if not page_raw_predictions_list:
2856
  # print("❌ FAILED at Step 2: Inference returned no data.")
2857
  # return None
 
2935
 
2936
 
2937
 
2938
+ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None):
2939
  if not os.path.exists(input_pdf_path):
2940
  print(f"❌ ERROR: File not found: {input_pdf_path}")
2941
+ yield {"status": "error", "message": f"File not found: {input_pdf_path}"}
2942
+ return
2943
 
2944
  print("\n" + "#" * 80)
2945
  print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
 
2967
  preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2968
  if not preprocessed_json_path_out:
2969
  print("❌ FAILED at Step 1: Preprocessing returned None.")
2970
+ yield {"status": "error", "message": "FAILED at Step 1: Preprocessing returned None."}
2971
+ return
2972
  print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
2973
 
2974
+ # ============================================================
2975
+ # ⏱️ YIELD ESTIMATION DATA IMMEDIATELY AFTER PHASE 1
2976
+ # ============================================================
2977
+ try:
2978
+ import fitz
2979
+ doc = fitz.open(input_pdf_path)
2980
+ page_count = doc.page_count
2981
+ doc.close()
2982
+ except Exception as e:
2983
+ print(f"⚠️ Could not get page count for estimation: {e}")
2984
+ page_count = 0
2985
+
2986
+ global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2987
+
2988
+ # Safely get the globals in case they weren't initialized
2989
+ fig_count = globals().get('GLOBAL_FIGURE_COUNT', 0)
2990
+ eq_count = globals().get('GLOBAL_EQUATION_COUNT', 0)
2991
+
2992
+ # Adjust these multipliers based on your pipeline's actual processing speed
2993
+ est_time_seconds = (page_count * 5) + (fig_count * 2) + (eq_count * 1)
2994
+
2995
+ yield {
2996
+ "status": "estimating",
2997
+ "page_count": page_count,
2998
+ "figure_count": fig_count,
2999
+ "equation_count": eq_count,
3000
+ "estimated_time_seconds": est_time_seconds
3001
+ }
3002
+ # ============================================================
3003
+
3004
  # --- Phase 2: Inference (MODIFIED) ---
3005
  print(f"\n[Step 2/5] Inference (Custom Model)...")
3006
  p2_start = time.time()
 
3022
 
3023
  if not page_raw_predictions_list:
3024
  print("❌ FAILED at Step 2: Inference returned no data.")
3025
+ yield {"status": "error", "message": "FAILED at Step 2: Inference returned no data."}
3026
+ return
3027
 
3028
  with open(raw_output_path, 'w', encoding='utf-8') as f:
3029
  json.dump(page_raw_predictions_list, f, indent=4)
 
3037
  )
3038
  if not structured_data_list:
3039
  print("❌ FAILED at Step 3: BIO conversion failed.")
3040
+ yield {"status": "error", "message": "FAILED at Step 3: BIO conversion failed."}
3041
+ return
3042
 
3043
  print("... Correcting misalignments and linking context ...")
3044
  structured_data_list = correct_misaligned_options(structured_data_list)
 
3051
  final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
3052
  if not final_result:
3053
  print("❌ FAILED at Step 4: Final formatting failed.")
3054
+ yield {"status": "error", "message": "FAILED at Step 4: Final formatting failed."}
3055
+ return
3056
  print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
3057
 
3058
  # --- Phase 4.5: Question Type Classification ---
 
3085
  print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
3086
  print(f"Error Message: {str(e)}")
3087
  traceback.print_exc()
3088
+ yield {"status": "error", "message": str(e)}
3089
+ return
3090
 
3091
  # finally:
3092
  # print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
 
3103
  print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
3104
  print("#" * 80)
3105
 
3106
+ # ============================================================
3107
+ # 🏁 YIELD FINAL RESULT
3108
+ # ============================================================
3109
+ yield {
3110
+ "status": "complete",
3111
+ "result": final_result
3112
+ }
3113
 
3114
 
3115