| import json |
| import random |
|
|
| import gradio as gr |
| import pymupdf |
|
|
|
|
def parse_pdf(file_path: str):
    """Parse a report PDF into row dictionaries and write them to a JSON file.

    The PDF is walked page by page and text block by text block
    (``TextPage.extractBLOCKS``).  A small state machine maps consecutive
    blocks onto the fields of the current row:

    * a block starting with "Report Run Date Time" supplies the report
      header values (its 2nd and 3rd lines);
    * a block containing "Fidelis Agent" arms the parser for the page; the
      block right after it is skipped (state ``"page_no"`` -- presumably the
      page number) and row parsing starts with the following block;
    * ``row_1`` .. ``row_5`` fill the fixed fields in order;
    * ``row_6`` .. ``row_8`` handle the optional comments field, which may be
      absent, merged into the same block as the agent, or spread over one or
      two blocks of its own before the agent block.

    The state is reset at the start of every page.

    Args:
        file_path: Path of the PDF, passed straight to ``pymupdf.open``.

    Returns:
        A ``(metadata, output_path)`` tuple: a short human-readable summary
        and the path of the JSON file that was written to the current
        working directory.

    Raises:
        ValueError: If a block does not have the expected shape (wrong
            number of lines or fields, non-numeric counts, or an agent block
            that is not in the agent column).
    """
    # Horizontal thresholds in page coordinates, taken as-is from the
    # report layout this parser was written for.
    agent_column_min_x0 = 700  # block starts in the agent column
    merged_block_min_x1 = 719  # comments block that also reaches into it

    report_run_date_time = None
    transportation_provider_name = None

    rows = []
    row = {}

    # The context manager closes the document even if parsing fails.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            state = None
            for block in page.get_textpage().extractBLOCKS():
                x0, _y0, x1, _y1, text = block[:5]
                text = text.strip()

                if text.startswith("Report Run Date Time"):
                    lines = text.split("\n")
                    report_run_date_time = lines[1]
                    transportation_provider_name = (
                        lines[2]
                        .replace("Transportation Provider Name:", "")
                        .strip()
                    )
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    state = "row_1"
                elif state == "row_1":
                    lines = text.split("\n")
                    if len(lines) == 1:
                        # Everything on one line: "<type> <case> <plan> <member>".
                        type_, case_no, plan_id, member_id = lines[0].split(" ")
                    else:
                        # Type on its own line, the three ids on the next one.
                        type_ = lines[0]
                        case_no, plan_id, member_id = lines[1].split(" ")
                    row["type"] = type_
                    row["case_no"] = case_no
                    row["plan_id"] = plan_id
                    row["member_id"] = member_id
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    pickup_time = text.replace("Pickup:", "").strip()
                    row["pickup_time"] = pickup_time.replace("\n", " ")
                    state = "row_4"
                elif state == "row_4":
                    lines = text.split("\n")
                    # Explicit check instead of ``assert`` so it still runs
                    # under ``python -O``.
                    if len(lines) not in (3, 4):
                        raise ValueError(
                            f"row_4 block must have 3 or 4 lines, "
                            f"got {len(lines)}: {text!r}"
                        )
                    row["num_of_one_way_trips"] = int(lines[0])
                    row["vehicle_type"] = lines[1]
                    row["num_of_riders"] = int(lines[2])
                    # The authorisation number is the optional 4th line.
                    row["auth_number"] = lines[3] if len(lines) == 4 else None
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    if x0 >= agent_column_min_x0:
                        # No comments at all: this block is already the agent.
                        row["special_needs_and_comments"] = None
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    elif x1 > merged_block_min_x1:
                        # Comments and agent were extracted as one block;
                        # the agent is its last line.
                        lines = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(lines[:-1])
                        row["fidelis_agent"] = lines[-1]
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        # Comments only; the agent follows in a later block.
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= agent_column_min_x0:
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        # Second comments block: append it (no separator,
                        # as before) and expect the agent next.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    if x0 < agent_column_min_x0:
                        raise ValueError(
                            f"expected the agent block at x0 >= "
                            f"{agent_column_min_x0}, got x0={x0}: {text!r}"
                        )
                    row["fidelis_agent"] = text
                    rows.append(row)
                    row = {}
                    state = "row_1"

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo for "items"; kept unchanged
        # because already-downloaded JSON / downstream tooling may rely on it.
        "itmes": rows,
    }

    output_path = f"output_{random.randint(0, 1000000):08d}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4)

    return metadata, output_path
|
|
|
|
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded PDF file and shows two outputs: a text
    box with the metadata summary and a button to download the JSON file.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Flagging is configured with the strings "never" / "auto" /
        # "manual"; the boolean form is deprecated.
        allow_flagging="never",
    )
    app.launch()
|
|
|
|
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|