Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +66 -12
working_yolo_pipeline.py
CHANGED
|
@@ -2799,7 +2799,6 @@ import glob
|
|
| 2799 |
|
| 2800 |
|
| 2801 |
|
| 2802 |
-
|
| 2803 |
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2804 |
# if not os.path.exists(input_pdf_path):
|
| 2805 |
# print(f"β ERROR: File not found: {input_pdf_path}")
|
|
@@ -2834,12 +2833,25 @@ import glob
|
|
| 2834 |
# return None
|
| 2835 |
# print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2836 |
|
| 2837 |
-
# # --- Phase 2: Inference ---
|
| 2838 |
-
# print(f"\n[Step 2/5] Inference (
|
| 2839 |
# p2_start = time.time()
|
|
|
|
|
|
|
|
|
|
| 2840 |
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2841 |
-
#
|
| 2842 |
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2843 |
# if not page_raw_predictions_list:
|
| 2844 |
# print("β FAILED at Step 2: Inference returned no data.")
|
| 2845 |
# return None
|
|
@@ -2923,10 +2935,11 @@ import glob
|
|
| 2923 |
|
| 2924 |
|
| 2925 |
|
| 2926 |
-
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None)
|
| 2927 |
if not os.path.exists(input_pdf_path):
|
| 2928 |
print(f"β ERROR: File not found: {input_pdf_path}")
|
| 2929 |
-
|
|
|
|
| 2930 |
|
| 2931 |
print("\n" + "#" * 80)
|
| 2932 |
print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
|
@@ -2954,9 +2967,40 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 2954 |
preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2955 |
if not preprocessed_json_path_out:
|
| 2956 |
print("β FAILED at Step 1: Preprocessing returned None.")
|
| 2957 |
-
|
|
|
|
| 2958 |
print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2959 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2960 |
# --- Phase 2: Inference (MODIFIED) ---
|
| 2961 |
print(f"\n[Step 2/5] Inference (Custom Model)...")
|
| 2962 |
p2_start = time.time()
|
|
@@ -2978,7 +3022,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 2978 |
|
| 2979 |
if not page_raw_predictions_list:
|
| 2980 |
print("β FAILED at Step 2: Inference returned no data.")
|
| 2981 |
-
|
|
|
|
| 2982 |
|
| 2983 |
with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 2984 |
json.dump(page_raw_predictions_list, f, indent=4)
|
|
@@ -2992,7 +3037,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 2992 |
)
|
| 2993 |
if not structured_data_list:
|
| 2994 |
print("β FAILED at Step 3: BIO conversion failed.")
|
| 2995 |
-
|
|
|
|
| 2996 |
|
| 2997 |
print("... Correcting misalignments and linking context ...")
|
| 2998 |
structured_data_list = correct_misaligned_options(structured_data_list)
|
|
@@ -3005,7 +3051,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 3005 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 3006 |
if not final_result:
|
| 3007 |
print("β FAILED at Step 4: Final formatting failed.")
|
| 3008 |
-
|
|
|
|
| 3009 |
print(f"β
Step 4 Complete ({time.time() - p4_start:.2f}s)")
|
| 3010 |
|
| 3011 |
# --- Phase 4.5: Question Type Classification ---
|
|
@@ -3038,7 +3085,8 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 3038 |
print(f"\nβΌοΈ FATAL PIPELINE EXCEPTION:")
|
| 3039 |
print(f"Error Message: {str(e)}")
|
| 3040 |
traceback.print_exc()
|
| 3041 |
-
|
|
|
|
| 3042 |
|
| 3043 |
# finally:
|
| 3044 |
# print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
|
|
@@ -3055,7 +3103,13 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 3055 |
print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
|
| 3056 |
print("#" * 80)
|
| 3057 |
|
| 3058 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3059 |
|
| 3060 |
|
| 3061 |
|
|
|
|
| 2799 |
|
| 2800 |
|
| 2801 |
|
|
|
|
| 2802 |
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2803 |
# if not os.path.exists(input_pdf_path):
|
| 2804 |
# print(f"β ERROR: File not found: {input_pdf_path}")
|
|
|
|
| 2833 |
# return None
|
| 2834 |
# print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2835 |
|
| 2836 |
+
# # --- Phase 2: Inference (MODIFIED) ---
|
| 2837 |
+
# print(f"\n[Step 2/5] Inference (Custom Model)...")
|
| 2838 |
# p2_start = time.time()
|
| 2839 |
+
|
| 2840 |
+
# # -------------------------------------------------------------------------
|
| 2841 |
+
# # --- COMMENTED OUT OLD LAYOUTLMV3 CALL FOR REVERSION ---
|
| 2842 |
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2843 |
+
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2844 |
# )
|
| 2845 |
+
# # -------------------------------------------------------------------------
|
| 2846 |
+
|
| 2847 |
+
# # --- NEW CUSTOM MODEL CALL ---
|
| 2848 |
+
# # Note: We only pass the JSON path because the custom function
|
| 2849 |
+
# # doesn't need to re-read the PDF or use the layoutlmv3 model path.
|
| 2850 |
+
# # page_raw_predictions_list = run_custom_inference_and_get_raw_words(
|
| 2851 |
+
# # preprocessed_json_path_out
|
| 2852 |
+
# # )
|
| 2853 |
+
# # -----------------------------
|
| 2854 |
+
|
| 2855 |
# if not page_raw_predictions_list:
|
| 2856 |
# print("β FAILED at Step 2: Inference returned no data.")
|
| 2857 |
# return None
|
|
|
|
| 2935 |
|
| 2936 |
|
| 2937 |
|
| 2938 |
+
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None):
|
| 2939 |
if not os.path.exists(input_pdf_path):
|
| 2940 |
print(f"β ERROR: File not found: {input_pdf_path}")
|
| 2941 |
+
yield {"status": "error", "message": f"File not found: {input_pdf_path}"}
|
| 2942 |
+
return
|
| 2943 |
|
| 2944 |
print("\n" + "#" * 80)
|
| 2945 |
print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
|
|
|
| 2967 |
preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2968 |
if not preprocessed_json_path_out:
|
| 2969 |
print("β FAILED at Step 1: Preprocessing returned None.")
|
| 2970 |
+
yield {"status": "error", "message": "FAILED at Step 1: Preprocessing returned None."}
|
| 2971 |
+
return
|
| 2972 |
print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2973 |
|
| 2974 |
+
# ============================================================
|
| 2975 |
+
# ⏱️ YIELD ESTIMATION DATA IMMEDIATELY AFTER PHASE 1
|
| 2976 |
+
# ============================================================
|
| 2977 |
+
try:
|
| 2978 |
+
import fitz
|
| 2979 |
+
doc = fitz.open(input_pdf_path)
|
| 2980 |
+
page_count = doc.page_count
|
| 2981 |
+
doc.close()
|
| 2982 |
+
except Exception as e:
|
| 2983 |
+
print(f"β οΈ Could not get page count for estimation: {e}")
|
| 2984 |
+
page_count = 0
|
| 2985 |
+
|
| 2986 |
+
global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
|
| 2987 |
+
|
| 2988 |
+
# Safely get the globals in case they weren't initialized
|
| 2989 |
+
fig_count = globals().get('GLOBAL_FIGURE_COUNT', 0)
|
| 2990 |
+
eq_count = globals().get('GLOBAL_EQUATION_COUNT', 0)
|
| 2991 |
+
|
| 2992 |
+
# Adjust these multipliers based on your pipeline's actual processing speed
|
| 2993 |
+
est_time_seconds = (page_count * 5) + (fig_count * 2) + (eq_count * 1)
|
| 2994 |
+
|
| 2995 |
+
yield {
|
| 2996 |
+
"status": "estimating",
|
| 2997 |
+
"page_count": page_count,
|
| 2998 |
+
"figure_count": fig_count,
|
| 2999 |
+
"equation_count": eq_count,
|
| 3000 |
+
"estimated_time_seconds": est_time_seconds
|
| 3001 |
+
}
|
| 3002 |
+
# ============================================================
|
| 3003 |
+
|
| 3004 |
# --- Phase 2: Inference (MODIFIED) ---
|
| 3005 |
print(f"\n[Step 2/5] Inference (Custom Model)...")
|
| 3006 |
p2_start = time.time()
|
|
|
|
| 3022 |
|
| 3023 |
if not page_raw_predictions_list:
|
| 3024 |
print("β FAILED at Step 2: Inference returned no data.")
|
| 3025 |
+
yield {"status": "error", "message": "FAILED at Step 2: Inference returned no data."}
|
| 3026 |
+
return
|
| 3027 |
|
| 3028 |
with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 3029 |
json.dump(page_raw_predictions_list, f, indent=4)
|
|
|
|
| 3037 |
)
|
| 3038 |
if not structured_data_list:
|
| 3039 |
print("β FAILED at Step 3: BIO conversion failed.")
|
| 3040 |
+
yield {"status": "error", "message": "FAILED at Step 3: BIO conversion failed."}
|
| 3041 |
+
return
|
| 3042 |
|
| 3043 |
print("... Correcting misalignments and linking context ...")
|
| 3044 |
structured_data_list = correct_misaligned_options(structured_data_list)
|
|
|
|
| 3051 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 3052 |
if not final_result:
|
| 3053 |
print("β FAILED at Step 4: Final formatting failed.")
|
| 3054 |
+
yield {"status": "error", "message": "FAILED at Step 4: Final formatting failed."}
|
| 3055 |
+
return
|
| 3056 |
print(f"β
Step 4 Complete ({time.time() - p4_start:.2f}s)")
|
| 3057 |
|
| 3058 |
# --- Phase 4.5: Question Type Classification ---
|
|
|
|
| 3085 |
print(f"\nβΌοΈ FATAL PIPELINE EXCEPTION:")
|
| 3086 |
print(f"Error Message: {str(e)}")
|
| 3087 |
traceback.print_exc()
|
| 3088 |
+
yield {"status": "error", "message": str(e)}
|
| 3089 |
+
return
|
| 3090 |
|
| 3091 |
# finally:
|
| 3092 |
# print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
|
|
|
|
| 3103 |
print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
|
| 3104 |
print("#" * 80)
|
| 3105 |
|
| 3106 |
+
# ============================================================
|
| 3107 |
+
# YIELD FINAL RESULT
|
| 3108 |
+
# ============================================================
|
| 3109 |
+
yield {
|
| 3110 |
+
"status": "complete",
|
| 3111 |
+
"result": final_result
|
| 3112 |
+
}
|
| 3113 |
|
| 3114 |
|
| 3115 |
|