| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| convert table label to html |
| """ |
|
|
| import json |
| import argparse |
| from tqdm import tqdm |
|
|
|
|
def save_pred_txt(key, val, tmp_file_path):
    """
    append one "key<TAB>val" record, terminated by a newline, to a text file
    @param key: value written before the tab
    @param val: value written after the tab
    @param tmp_file_path: file to append to; it is opened in "a+" mode as
        UTF-8, so existing content is kept and the file is created if missing
    @return: None
    """
    with open(tmp_file_path, mode="a+", encoding="utf-8") as out_file:
        record = "{}\t{}\n".format(key, val)
        out_file.write(record)
|
|
|
|
def skip_char(text, sp_char_list):
    """
    remove every occurrence of each listed style tag / special char from text
    @param text: text in cell
    @param sp_char_list: substrings to delete; they are removed one after
        another, in list order
    @return: text with all listed substrings removed (empty string when
        nothing else is left)
    """
    stripped = text
    for marker in sp_char_list:
        stripped = stripped.replace(marker, "")
    return stripped
|
|
|
|
def gen_html(img):
    """
    Build the full HTML string of a table from its tokenized annotation.

    The structure tokens in img["html"]["structure"]["tokens"] are copied, and
    the joined tokens of each entry of img["html"]["cells"] are inserted right
    after a "<td>" or ">" structure token. Cells and insertion points are
    paired starting from the end. A cell is left out when it has no tokens or
    when nothing remains after removing style tags, spaces and "\\u2028".
    @param img: annotation dict of one image
    @return: "<html><body><table>...</table></body></html>" string
    """
    annotation = img["html"]
    tokens = annotation["structure"]["tokens"].copy()
    # Markers that do not count as real content when deciding if a cell is empty.
    style_markers = ["<b>", "</b>", "\u2028", " ", "<i>", "</i>"]
    slots = [pos for pos, tok in enumerate(tokens) if tok in ("<td>", ">")]
    # Walk backwards so earlier insertion positions stay valid after each insert.
    for pos, cell in zip(reversed(slots), reversed(annotation["cells"])):
        cell_tokens = cell["tokens"]
        if not cell_tokens:
            continue
        content = "".join(cell_tokens)
        if not skip_char(content, style_markers):
            continue
        # The original text (style tags included) is what gets inserted.
        tokens.insert(pos + 1, content)
    return "<html><body><table>{}</table></body></html>".format("".join(tokens))
|
|
|
|
def load_gt_data(gt_path):
    """
    load gt records from a label file holding one JSON object per line
    @param gt_path: path of the UTF-8 encoded label file; every record must
        contain a "filename" key
    @return: dict mapping each record's "filename" to the decoded record; when
        a filename occurs more than once the last record wins
    """
    records = {}
    with open(gt_path, "rb") as f:
        raw_lines = f.readlines()
    for raw_line in tqdm(raw_lines):
        data_line = raw_line.decode("utf-8").strip("\n")
        if not data_line.strip():
            # Blank / whitespace-only lines carry no record; json.loads would
            # raise JSONDecodeError on them, so skip instead of aborting.
            continue
        info = json.loads(data_line)
        records[info["filename"]] = info
    return records
|
|
|
|
def convert(origin_gt_path, save_path):
    """
    gen html for every record of a label file and write the results out
    @param origin_gt_path: label file read through load_gt_data
    @param save_path: output text file; one "filename<TAB>html" line is
        appended per record via save_pred_txt (existing content is kept)
    @return: None
    """
    records = load_gt_data(origin_gt_path)
    for filename, annotation in tqdm(records.items()):
        save_pred_txt(filename, gen_html(annotation), save_path)
    print("convert finish")
|
|
|
|
def parse_args(argv=None):
    """
    parse the command-line options of the label-to-html conversion script
    @param argv: optional list of argument strings; when None (the default)
        argparse reads sys.argv[1:], as before
    @return: argparse.Namespace with the attributes ori_gt_path and save_path
    """
    # The previous description ("args for paddleserving") belonged to another
    # tool and made --help misleading for this script.
    parser = argparse.ArgumentParser(
        description="convert table label file to html"
    )
    parser.add_argument("--ori_gt_path", type=str, required=True, help="label gt path")
    parser.add_argument(
        "--save_path", type=str, required=True, help="path to save file"
    )
    return parser.parse_args(argv)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the two required paths from the command line
    # and run the label-to-html conversion.
    cli_args = parse_args()
    convert(origin_gt_path=cli_args.ori_gt_path, save_path=cli_args.save_path)
|
|