shahidul034 commited on
Commit
9c6961c
·
verified ·
1 Parent(s): 034cb04

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/added_tokens.json +28 -0
  2. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/chat_template.jinja +61 -0
  3. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/config.json +68 -0
  4. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/merges.txt +0 -0
  5. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/special_tokens_map.json +31 -0
  6. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/vocab.json +0 -0
  7. code/RL_model/models/converted_model/v1/added_tokens.json +28 -0
  8. code/RL_model/models/converted_model/v1/chat_template.jinja +61 -0
  9. code/RL_model/models/converted_model/v1/config.json +68 -0
  10. code/RL_model/models/converted_model/v1/generation_config.json +13 -0
  11. code/RL_model/models/converted_model/v1/merges.txt +0 -0
  12. code/RL_model/models/converted_model/v1/model.safetensors.index.json +407 -0
  13. code/RL_model/models/converted_model/v1/special_tokens_map.json +31 -0
  14. code/RL_model/models/converted_model/v1/tokenizer_config.json +239 -0
  15. code/RL_model/models/converted_model/v1/vocab.json +0 -0
  16. code/finetune-inference/old/api_call.py +125 -0
  17. code/finetune-inference/old/api_call_vllm.py +135 -0
  18. code/finetune-inference/old/attribution_reasoning.py +198 -0
  19. code/finetune-inference/old/completeness_conciseness_attribution_cal.py +151 -0
  20. code/finetune-inference/old/completeness_reasoning_v1.py +186 -0
  21. code/finetune-inference/old/completeness_reasoning_v2.py +186 -0
  22. code/finetune-inference/old/completeness_reasoning_v3.py +171 -0
  23. code/finetune-inference/old/extracting_subclaims.py +196 -0
  24. code/finetune-inference/old/extracting_subclaims_v2.py +170 -0
  25. code/finetune-inference/old/extracting_subclaims_v3.py +175 -0
  26. code/finetune-inference/old/inference.py +91 -0
  27. code/finetune-inference/old/inferenceV2_without_context.py +137 -0
  28. code/finetune-inference/old/inferenceV3.py +161 -0
  29. code/finetune-inference/old/inferenceV3_temp.py +144 -0
  30. code/finetune-inference/old/inferenceV4.py +154 -0
  31. code/finetune-inference/old/inference_extract_subclaims.py +162 -0
  32. code/finetune-inference/old/inference_extract_subclaims_v2.py +179 -0
  33. code/finetune-inference/old/inference_extract_subclaims_v3.py +182 -0
  34. code/finetune-inference/old/nemotran_inference.py +174 -0
  35. code/finetune-inference/old/prompt_generate.py +254 -0
  36. code/finetune-inference/old/statistics.ipynb +400 -0
  37. code/finetune-inference/subclaim_support/readctrl_model.code-workspace +13 -0
  38. code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_gpt5.py +206 -0
  39. code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_v4.py +180 -0
  40. code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_vllm.py +163 -0
  41. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal.py +248 -0
  42. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_tesing_v2.py +203 -0
  43. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v2.py +304 -0
  44. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v3.py +256 -0
  45. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v4.py +309 -0
  46. code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v5.py +281 -0
  47. code/finetune-inference/subclaim_support_extraction/readctrl_model.code-workspace +18 -0
  48. code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing.py +199 -0
  49. code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing_v2.py +138 -0
  50. code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing_v3.py +131 -0
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/chat_template.jinja ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' + content }}
27
+ {%- if message.tool_calls %}
28
+ {%- for tool_call in message.tool_calls %}
29
+ {%- if (loop.first and content) or (not loop.first) %}
30
+ {{- '\n' }}
31
+ {%- endif %}
32
+ {%- if tool_call.function %}
33
+ {%- set tool_call = tool_call.function %}
34
+ {%- endif %}
35
+ {{- '<tool_call>\n{"name": "' }}
36
+ {{- tool_call.name }}
37
+ {{- '", "arguments": ' }}
38
+ {%- if tool_call.arguments is string %}
39
+ {{- tool_call.arguments }}
40
+ {%- else %}
41
+ {{- tool_call.arguments | tojson }}
42
+ {%- endif %}
43
+ {{- '}\n</tool_call>' }}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- elif message.role == "tool" %}
48
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
+ {{- '<|im_start|>user' }}
50
+ {%- endif %}
51
+ {{- '\n<tool_response>\n' }}
52
+ {{- content }}
53
+ {{- '\n</tool_response>' }}
54
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
+ {{- '<|im_end|>\n' }}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- if add_generation_prompt %}
60
+ {{- '<|im_start|>assistant\n' }}
61
+ {%- endif %}
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "float32",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2560,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 9728,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention"
51
+ ],
52
+ "max_position_embeddings": 262144,
53
+ "max_window_layers": 36,
54
+ "model_type": "qwen3",
55
+ "num_attention_heads": 32,
56
+ "num_hidden_layers": 36,
57
+ "num_key_value_heads": 8,
58
+ "pad_token_id": 151643,
59
+ "rms_norm_eps": 1e-06,
60
+ "rope_scaling": null,
61
+ "rope_theta": 5000000,
62
+ "sliding_window": null,
63
+ "tie_word_embeddings": true,
64
+ "transformers_version": "4.56.1",
65
+ "use_cache": true,
66
+ "use_sliding_window": false,
67
+ "vocab_size": 151936
68
+ }
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
code/RL_model/models/converted_model/v1/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
code/RL_model/models/converted_model/v1/chat_template.jinja ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' + content }}
27
+ {%- if message.tool_calls %}
28
+ {%- for tool_call in message.tool_calls %}
29
+ {%- if (loop.first and content) or (not loop.first) %}
30
+ {{- '\n' }}
31
+ {%- endif %}
32
+ {%- if tool_call.function %}
33
+ {%- set tool_call = tool_call.function %}
34
+ {%- endif %}
35
+ {{- '<tool_call>\n{"name": "' }}
36
+ {{- tool_call.name }}
37
+ {{- '", "arguments": ' }}
38
+ {%- if tool_call.arguments is string %}
39
+ {{- tool_call.arguments }}
40
+ {%- else %}
41
+ {{- tool_call.arguments | tojson }}
42
+ {%- endif %}
43
+ {{- '}\n</tool_call>' }}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- elif message.role == "tool" %}
48
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
+ {{- '<|im_start|>user' }}
50
+ {%- endif %}
51
+ {{- '\n<tool_response>\n' }}
52
+ {{- content }}
53
+ {{- '\n</tool_response>' }}
54
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
+ {{- '<|im_end|>\n' }}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- if add_generation_prompt %}
60
+ {{- '<|im_start|>assistant\n' }}
61
+ {%- endif %}
code/RL_model/models/converted_model/v1/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2560,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 9728,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention"
51
+ ],
52
+ "max_position_embeddings": 262144,
53
+ "max_window_layers": 36,
54
+ "model_type": "qwen3",
55
+ "num_attention_heads": 32,
56
+ "num_hidden_layers": 36,
57
+ "num_key_value_heads": 8,
58
+ "pad_token_id": 151643,
59
+ "rms_norm_eps": 1e-06,
60
+ "rope_scaling": null,
61
+ "rope_theta": 5000000,
62
+ "sliding_window": null,
63
+ "tie_word_embeddings": true,
64
+ "transformers_version": "4.56.1",
65
+ "use_cache": true,
66
+ "use_sliding_window": false,
67
+ "vocab_size": 151936
68
+ }
code/RL_model/models/converted_model/v1/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.56.1"
13
+ }
code/RL_model/models/converted_model/v1/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
code/RL_model/models/converted_model/v1/model.safetensors.index.json ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4411424256,
4
+ "total_size": 8822848512
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00001-of-00002.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
17
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
41
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
43
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
44
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
46
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
51
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
54
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
56
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
57
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
58
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
61
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
63
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
64
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
68
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
69
+ "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
75
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
76
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
78
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
79
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
83
+ "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
84
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
85
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
86
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
88
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
89
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
94
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
100
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
106
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
107
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
108
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
109
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
113
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
117
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
118
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
124
+ "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
132
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
133
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
134
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
135
+ "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
138
+ "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
139
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
140
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
141
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
142
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
144
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.2.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
149
+ "model.layers.2.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
150
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
161
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
230
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
244
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.28.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
256
+ "model.layers.29.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
259
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
267
+ "model.layers.3.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
271
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
276
+ "model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
277
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.30.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.30.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
282
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
292
+ "model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
294
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
297
+ "model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
299
+ "model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
300
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.32.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
302
+ "model.layers.32.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.32.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.33.input_layernorm.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.33.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
310
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.33.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
313
+ "model.layers.33.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
314
+ "model.layers.33.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
315
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.34.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
328
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
333
+ "model.layers.35.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
334
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.35.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
337
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.5.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
362
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
364
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
366
+ "model.layers.6.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
367
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
373
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
375
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
376
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
379
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
384
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
385
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
386
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
387
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
389
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
390
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
391
+ "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
392
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
393
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
394
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
395
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
396
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
398
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
403
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
405
+ "model.norm.weight": "model-00001-of-00002.safetensors"
406
+ }
407
+ }
code/RL_model/models/converted_model/v1/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
code/RL_model/models/converted_model/v1/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 1010000,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
code/RL_model/models/converted_model/v1/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
code/finetune-inference/old/api_call.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import re
3
+
4
# Shared OpenAI client; picks up OPENAI_API_KEY from the environment.
client = OpenAI()
5
+
6
# --- Fernández Huerta formula ---
def fernandez_huerta_score(text: str) -> float:
    """Compute the Fernández Huerta readability score for Spanish *text*.

    Formula: 206.84 - 0.60 * P - 1.02 * F, where P is syllables per 100
    words and F is the average words per sentence.  Higher scores mean
    easier text (roughly 0-100+).

    Fix over the previous version: syllables are approximated as maximal
    runs of consecutive vowels, so diphthongs/triphthongs ("ue", "iai",
    ...) count as ONE syllable instead of one per vowel, which inflated
    the syllable count and artificially depressed the score.
    Empty input is handled by clamping the divisors to 1.
    """
    # Split on terminal punctuation; discard empty fragments.
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    n_sentences = len(sentences) or 1  # avoid division by zero

    words = text.split()
    n_words = len(words) or 1  # avoid division by zero on empty input

    # A "syllable" is a maximal run of (possibly accented) vowels in a word.
    n_syllables = sum(
        len(re.findall(r'[aeiouáéíóúü]+', word, flags=re.IGNORECASE))
        for word in words
    )

    return 206.84 - 0.60 * (n_syllables / n_words * 100) - 1.02 * (n_words / n_sentences)
+
20
+
21
# --- Prompt templates for each label ---
# Spanish rewriting prompts keyed by readability level.  Each template
# exposes a single {original_text} placeholder and instructs the model to
# rewrite at the target grade level while preserving meaning (no additions,
# removals, or factual changes).
LABEL_PROMPTS = {
    "easy": """Texto original:
{original_text}

Reescribe el texto en un lenguaje muy simple, frases cortas y vocabulario fácil, adecuado para estudiantes de 5º a 7º grado.
El resultado debe seguir lógicamente el texto original y mantener el mismo significado.
No añadas información nueva, no elimines detalles importantes ni cambies los hechos.
""",
    "intermediate": """Texto original:
{original_text}

Reescribe el texto con una complejidad moderada, frases más largas y vocabulario variado, adecuado para secundaria/bachillerato (8º a 12º grado).
El resultado debe seguir lógicamente el texto original y mantener el mismo significado.
No añadas información nueva, no elimines detalles importantes ni cambies los hechos.
""",
    "hard": """Texto original:
{original_text}

Reescribe el texto con lenguaje técnico, detallado y especializado, adecuado para universidad o profesionales.
El resultado debe seguir lógicamente el texto original y mantener el mismo significado.
No añadas información nueva, no elimines detalles importantes ni cambies los hechos.
"""
}
+ }
45
+
46
+
47
# --- Generate text for a label ---
def generate_label_text(original_text: str, label: str) -> str:
    """Rewrite *original_text* at readability level *label*.

    *label* must be a key of LABEL_PROMPTS ("easy"/"intermediate"/"hard");
    an unknown label raises KeyError.  Sends one chat-completion request
    and returns the stripped completion text.  Network/API errors from the
    OpenAI client propagate to the caller.
    """
    prompt = LABEL_PROMPTS[label].format(original_text=original_text)
    response = client.chat.completions.create(
        model="gpt-5-mini",  # first try with mini
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()
55
+
56
+
57
# --- Regenerate if FH score is out of range ---
def regenerate_label_text(original_text: str, old_text: str, label: str, target_range: tuple) -> str:
    """Ask the stronger model to redo a rewrite whose score missed its band.

    *old_text* is the previous attempt; *target_range* is the (low, high)
    Fernández Huerta band the rewrite should land in.  Returns the stripped
    completion text.
    NOTE(review): the caller never re-scores the regenerated text, so the
    returned text is not guaranteed to actually fall inside *target_range*.
    """
    prompt = f"""Texto original:
{original_text}

Texto generado (necesita ajuste):
{old_text}

El texto anterior no cumple con el rango de legibilidad {target_range}.
Reescribe nuevamente el texto en el nivel "{label}", ajustando la dificultad
para que el puntaje de Fernández Huerta quede dentro del rango {target_range}.
El resultado debe seguir lógicamente el texto original y mantener el mismo significado.
No añadas información nueva, no elimines detalles importantes ni cambies los hechos.
"""
    response = client.chat.completions.create(
        model="gpt-5",  # use stronger model for regeneration
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()
76
+
77
+
78
+
79
# --- Target ranges for FH ---
# Inclusive Fernández Huerta score bands per readability label; used to
# validate generated text and to trigger one regeneration when out of range.
RANGES = {
    "easy": (70, 100),
    "intermediate": (50, 70),
    "hard": (0, 50)
}
85
+
86
+
87
# --- Full pipeline for one topic ---
def generate_synthetic_data(original_text: str, original_language: str, topic: str, data_id: int):
    """Produce easy/intermediate/hard rewrites of *original_text*.

    For every label in RANGES: generate a rewrite, score it with the
    Fernández Huerta formula, and regenerate once (with the stronger
    model) if the score falls outside the label's target band.  Returns a
    dict holding the record id, source metadata and one entry per label
    under "readability_versions".
    """
    # Human-readable audience per level (Spanish, kept verbatim).
    audiences = {
        "easy": "Estudiantes de primaria/media (5º a 7º grado)",
        "intermediate": "Secundaria/Bachillerato (8º a 12º grado)",
        "hard": "Profesionales / Universidad o posgrado",
    }

    versions = {}
    for label, target_range in RANGES.items():
        # Step 1: generate a candidate rewrite for this level.
        candidate = generate_label_text(original_text, label)

        # Step 2: one regeneration pass if the readability score misses the band.
        low, high = target_range
        if not low <= fernandez_huerta_score(candidate) <= high:
            candidate = regenerate_label_text(original_text, candidate, label, target_range)

        # Step 3: record the (possibly regenerated) version.
        versions[label] = {
            "readability_level": label,
            "fernandez_huerta_range": f"{target_range[0]}-{target_range[1]}",
            "target_audience": audiences.get(label, audiences["hard"]),
            "text": candidate,
        }

    return {
        "id": data_id,
        "original_text_language": original_language,
        "source_topic": topic,
        "readability_versions": versions,
    }
119
+
120
+
121
# --- Example usage ---
if __name__ == "__main__":
    # Demo run: requires OPENAI_API_KEY and network access.
    original_text = "Se diagnosticó osteoartritis bilateral en un paciente de 61 años con dolor en la ingle."
    data = generate_synthetic_data(original_text, "es", "Osteoartritis de cadera", 1)
    print(data)
code/finetune-inference/old/api_call_vllm.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import tqdm
import argparse
from openai import OpenAI

# -----------------------------
# CONFIGURATION
# -----------------------------
# Must match the model path passed to the vLLM server (see run_vllm.sh).
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims_BF16_merged"
API_URL = "http://localhost:8015/v1"
# vLLM requires *some* API key; the value is ignored unless the server enforces one.
API_KEY = "EMPTY"

# OpenAI-compatible client pointed at the local vLLM endpoint.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
17
+
18
# -----------------------------
# SUBCLAIM EXTRACTION PROMPT
# -----------------------------
def extraction_prompt(medical_text: str) -> str:
    """Return the subclaim-extraction prompt for one medical text."""
    return f"""
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.
Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Do not add, guess, or infer information.
5. Each subclaim should be short, specific, and verifiable.
6. Return ONLY a Python-style list of strings.
Medical Text:
{medical_text}
Return your output in JSON list format, like:
[
"subclaim 1",
"subclaim 2",
...
]
"""
42
+
43
# -----------------------------
# INFERENCE FUNCTION (vLLM)
# -----------------------------
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> str:
    """Send the extraction prompt to the vLLM server and return the reply.

    Any Qwen "thinking" trace is stripped (only text after the last
    ``</think>`` tag is kept). Returns ``None`` when the API call fails.
    """
    final_prompt = extraction_prompt(medical_text)

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": final_prompt}],
            max_tokens=1000,  # cap generation length
            temperature=temperature,
            top_p=0.9,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        reply = completion.choices[0].message.content.strip()
        # Drop the reasoning trace, if the model emitted one.
        return reply.split("</think>")[-1].strip()
    except Exception as e:
        print(f"Error during API call: {e}")
        return None
71
+
72
# -----------------------------
# MAIN EXECUTION
# -----------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the input JSON file containing medical texts.")
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]

    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    os.makedirs(SAVE_FOLDER, exist_ok=True)
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}.json")

    # Load input dataset
    with open(INPUT_FILE, "r") as f:
        data = json.load(f)

    # Load existing results so interrupted runs can resume.
    result = []
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            try:
                result = json.load(f)
            except json.JSONDecodeError:
                result = []

    existing_ids = {item["id"] for item in result}

    print(f"Starting inference on {len(data)} items using vLLM server...")

    # --------------------------------------------------------
    # PROCESS EACH MEDICAL TEXT
    # --------------------------------------------------------
    for item in tqdm.tqdm(data):
        if item["id"] in existing_ids:
            continue

        medical_text = item.get("fulltext", "")

        # Call the vLLM inference function (may return None on API error).
        extracted = infer_subclaims(medical_text)

        result.append({
            "id": item["id"],
            "medical_text": medical_text,
            "subclaims": extracted,
            "summary": item.get("summary", ""),
        })

        # Checkpoint every 20 accumulated entries.
        # BUG FIX: the previous version opened the file in "w" mode but only
        # wrote when a `save` flag (hard-coded to False) was set, which
        # truncated the output file without ever persisting any results.
        if len(result) % 20 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(result, f, indent=4, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"Extraction completed. Saved to {OUTPUT_FILE}")
code/finetune-inference/old/attribution_reasoning.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import sys
import ast
import os
from openai import OpenAI

# ===========================
# CONFIGURATION
# ===========================
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged"
VLLM_API_URL = "http://localhost:8004/v1"
VLLM_API_KEY = "EMPTY"  # vLLM accepts any key unless the server enforces one

# OpenAI-compatible client pointed at the local vLLM server.
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)
17
+
18
# ===========================
# INFERENCE FUNCTION
# ===========================
def infer_reasonableness(
    fulltext: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """Judge whether an unsupported subclaim is a reasonable addition.

    Sends the evaluation prompt to the local vLLM server. Returns the parsed
    JSON object when the reply can be parsed, otherwise the raw output text.
    Connection/validation errors from the client propagate to the caller.
    """

    # ---- Build inference prompt ----
    prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether the **unsupported subclaim** in a generated summary (when `"result": 0"`) is a *reasonable addition* given the readability level (*easy / intermediate / hard*).

The goal is to decide whether this **extra piece of information** is an acceptable simplification or a *hallucination* that reduces factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |
| :-- | :-- | :-- | :-- | :-- |
| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |
| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |
| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |

---

### **Input**

```
Readability Level: {readability_level}

Reference Full Text:
{fulltext}

Generated Summary:
{generated_summary}

Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = unsupported
```

---

### **TASK INSTRUCTIONS**

If `"result": 0"`, judge whether including this subclaim is **reasonable** for the given readability level.
Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
Provide a **1–2 sentence justification** describing your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
  "evaluation": {{
    "reasonableness": "<reasonable | partially_reasonable | unreasonable>",
    "justification": "<short explanation>"
  }}
}}
```
""".strip()

    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )

    output_text = response.choices[0].message.content

    # ---- Clean Output (strip thinking trace and markdown fences) ----
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1]
    clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

    # BUG FIX: the model is instructed to return JSON, but the old code parsed
    # with ast.literal_eval only, which rejects JSON-only tokens such as
    # true/false/null. Try JSON first, then fall back to literal_eval; return
    # the raw text if neither parser succeeds.
    try:
        return json.loads(clean_text)
    except Exception:
        try:
            return ast.literal_eval(clean_text)
        except Exception:
            return output_text
119
+
120
+
121
# ===========================
# MAIN EXECUTION
# ===========================
if __name__ == "__main__":
    import argparse
    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    args = parser.parse_args()
    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Open file directly (will raise FileNotFoundError if missing).
    with open(data_path, 'r') as f:
        dataset = json.load(f)

    save_path = f'/home/mshahidul/readctrl/data/attribution_reasoning_result/{file_name}'
    os.makedirs('/home/mshahidul/readctrl/data/attribution_reasoning_result/', exist_ok=True)

    # Resume support: reload previously saved results if present.
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r') as f:
            full_results = json.load(f)

    # PERF FIX: the previous version ran `any(d['id'] == item['id'] ...)` per
    # dataset item (O(N^2) overall); a set built once gives O(1) membership.
    done_ids = {d['id'] for d in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        fulltext = item['fulltext']
        temp2 = {}
        for label in ['easy', 'intermediate', 'hard']:
            generated_summary = item[f'{label}_text']
            subclaim_list = item['metrics'][f'{label}']['attribution']['details']
            temp = []
            for subclaim in subclaim_list:
                result = 1 if subclaim['label'] == 'supported' else 0

                if result == 0:
                    # Only unsupported subclaims need a model judgement.
                    output = infer_reasonableness(
                        fulltext=fulltext,
                        generated_summary=generated_summary,
                        readability_level=label,
                        subclaim_text=subclaim['subclaim'],
                        result=result,
                    )
                else:
                    output = {
                        'reasonableness': 'reasonable',
                        'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                    }

                temp.append({
                    'subclaim': subclaim['subclaim'],
                    'output': output
                })

            temp2[label] = {
                'results': temp
            }

        full_results.append({
            'id': item['id'],
            'completeness': temp2
        })
        done_ids.add(item['id'])

        # Checkpoint every 10 results.
        if len(full_results) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(full_results, f, indent=2, ensure_ascii=False)

    with open(save_path, 'w') as f:
        json.dump(full_results, f, indent=2, ensure_ascii=False)
196
+
197
+
198
+
code/finetune-inference/old/completeness_conciseness_attribution_cal.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# GPU selection must be set before torch / unsloth are imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from unsloth import FastLanguageModel
import json

# Module-level singleton so repeated inference calls reuse one model/tokenizer.
_model_cache = {"model": None, "tokenizer": None}
10
+
11
def load_finetuned_model(model_path: str):
    """Load the fine-tuned model + tokenizer once and serve it from the cache."""
    if _model_cache["model"] is None:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=4092,
            load_in_4bit=False,
            load_in_8bit=False,
            full_finetuning=False,
        )
        _model_cache["model"] = model
        _model_cache["tokenizer"] = tokenizer
    return _model_cache["model"], _model_cache["tokenizer"]
26
+
27
+
28
def infer_subclaim(text: str, subclaim: str, model_path: str = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-verifier_lora_nonreasoning", cuda_device: str = "0") -> str:
    """Return '1' if `text` supports `subclaim`, otherwise '0'.

    Note: `cuda_device` is kept for interface compatibility but is unused;
    device selection happens via CUDA_VISIBLE_DEVICES at import time.
    """
    model, tokenizer = load_finetuned_model(model_path)

    # Build prompt (the same structure the model was trained on).
    prompt = f"""
Given the following medical text and subclaim, decide if the text supports the subclaim.
Text: {text}
Subclaim: {subclaim}
Respond only with 1 if the text supports the subclaim, otherwise 0.
""".strip()

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=10,
            temperature=0.1,
            top_p=0.8,
            top_k=5,
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    # BUG FIX: split(...)[1] raised IndexError when the decoded output had no
    # "</think>" marker (thinking is disabled above); [-1] is safe either way.
    return output_text.split("</think>")[-1].strip()
63
+
64
if __name__ == "__main__":
    def _score_subclaims(metric, version, example_text, example_subclaims):
        """Verify each subclaim against `example_text` and tally accuracy.

        Shared by the three metrics below (they previously duplicated this
        loop verbatim). Results with neither "1" nor "0" are recorded but
        excluded from the accuracy denominator, as before.
        """
        res, total, correct = [], 0, 0
        for example_subclaim in example_subclaims:
            result = infer_subclaim(example_text, example_subclaim)
            if "1" in result:
                correct += 1
                total += 1
            elif "0" in result:
                total += 1
            res.append({
                "subclaim": example_subclaim,
                "result": result
            })
        return {
            "metric": metric,
            "version": version,
            "input_text": example_text,
            "results": res,
            "total": total,
            "correct": correct,
            "accuracy": (correct / total) * 100 if total > 0 else 0,
        }

    def process_completeness(example, version):
        # Reference-summary subclaims checked against the readability version.
        return _score_subclaims(
            "completeness", version,
            example["readability_versions"][version]["text"],
            example["ref_summary"]["subclaims"],
        )

    def process_conciseness(example, version):
        # Readability-version subclaims checked against the reference summary.
        return _score_subclaims(
            "conciseness", version,
            example["ref_summary"]["text"],
            example["readability_versions"][version]["subclaims"],
        )

    def process_attribution(example, version):
        # Readability-version subclaims checked against the full source text.
        return _score_subclaims(
            "attribution", version,
            example["full_text"],
            example["readability_versions"][version]["subclaims"],
        )

    with open("/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    import tqdm
    full_data_results = []
    save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
    for item in tqdm.tqdm(data):
        print(f"Processing item ID: {item['id']}")
        for version in ["easy", "intermediate", "hard"]:
            completeness = process_completeness(item, version)
            conciseness = process_conciseness(item, version)
            attribution = process_attribution(item, version)
            full_data_results.append({
                "id": item["id"],
                "version": version,
                "completeness": completeness,
                "conciseness": conciseness,
                "attribution": attribution
            })
            # Checkpoint every 5 accumulated rows.
            if len(full_data_results) % 5 == 0:
                with open(save_path, "w", encoding="utf-8") as f:
                    json.dump(full_data_results, f, indent=4, ensure_ascii=False)
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(full_data_results, f, indent=4, ensure_ascii=False)
code/finetune-inference/old/completeness_reasoning_v1.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# ===========================
# GPU SETTINGS
# ===========================
# GPU selection must be set before torch / unsloth are imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import torch
from unsloth import FastLanguageModel
import json

# ===========================
# MODEL LOADING (CACHED)
# ===========================
# Module-level singleton so repeated inference calls reuse one model/tokenizer.
_model_cache = {"model": None, "tokenizer": None}
17
+
18
def load_finetuned_model(model_path: str):
    """Load the fine-tuned model + tokenizer once and serve it from the cache."""
    if _model_cache["model"] is None:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=4096,
            load_in_4bit=False,
            load_in_8bit=False,
            full_finetuning=False,
        )
        _model_cache["model"] = model
        _model_cache["tokenizer"] = tokenizer
    return _model_cache["model"], _model_cache["tokenizer"]
32
+
33
+
34
# ===========================
# INFERENCE FUNCTION
# ===========================
def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
    model_path: str = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-completeness_resonability_check_v2",
):
    """
    Given the reference summary, generated summary, readability level, subclaim, and its result (0/1),
    predict reasonableness: reasonable / partially_reasonable / unreasonable, plus justification.
    Returns the parsed JSON dict when possible, otherwise the raw output text.
    """
    model, tokenizer = load_finetuned_model(model_path)

    # ---- Build inference prompt (same structure as training) ----
    prompt = f"""
You are an impartial medical summarization evaluator.

Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.

Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.

Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.

### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = omitted

### Task
Respond **only** with the following JSON object:

{{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short clear explanation>"
}}
""".strip()

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # important for Unsloth chat template
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    # ---- Generate output ----
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            # Greedy decoding; the sampling knobs previously passed alongside
            # do_sample=False (temperature/top_p/top_k) were ignored anyway.
            do_sample=False,
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    # BUG FIX: split(...)[1] raised IndexError when the decoded output had no
    # "</think>" marker (thinking is disabled above); [-1] is safe either way.
    output_text = output_text.split("</think>")[-1].strip()
    # ---- Extract model JSON output ----
    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed
116
+
117
+
118
# ===========================
# EXAMPLE USAGE
# ===========================
if __name__ == "__main__":
    import json
    import tqdm

    # Reference summaries / full texts keyed by example id.
    with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json', 'r') as f:
        multiclinsum_gs_train_es_data = json.load(f)
    ref_summaries = {item['id']: item['summary'] for item in multiclinsum_gs_train_es_data}
    fulltexts = {item['id']: item['fulltext'] for item in multiclinsum_gs_train_es_data}

    # Generated summaries keyed by (id, readability version).
    with open('/home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.json', 'r') as f:
        synthetic_data_es_raw_592 = json.load(f)
    generated_summaries = {
        (item['id'], version): item['readability_versions'][version]['text']
        for item in synthetic_data_es_raw_592
        for version in ['easy', 'intermediate', 'hard']
    }

    with open("/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json", 'r') as f:
        qwen3_32B_results = json.load(f)

    save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/completeness_resonability_check_100_qwen3-32B_v3.json"
    full_res = []
    for idx, item in tqdm.tqdm(enumerate(qwen3_32B_results)):
        print(f"Processing item {idx + 1}/{len(qwen3_32B_results)}")
        reference_summary = ref_summaries[item['id']]
        fulltext = fulltexts[item['id']]  # looked up for parity with siblings; unused below
        generated_summary = generated_summaries[(item['id'], item['version'])]
        temp_res = []
        for item2 in item['completeness']['results']:
            subclaim_text = item2['subclaim']['subclaim']
            result = item2['result']
            if result == "1":
                # Supported subclaims need no reasonableness judgement.
                continue
            response = infer_reasonableness(
                reference_summary,
                generated_summary,
                item['version'],
                subclaim_text,
                result,
                model_path="/home/mshahidul/readctrl_model/qwen3-32B_subclaims-completeness_resonability_check",
            )
            temp_res.append({
                'id': item2['subclaim']['id'],
                "subclaim": subclaim_text,
                "result": result,
                "reasonableness": response,
            })
        full_res.append({
            "id": item['id'],
            "version": item['version'],
            "completeness": {"results": temp_res},
        })
        # Checkpoint every 10 processed items.
        if len(full_res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(full_res, f, indent=2, ensure_ascii=False)

    with open(save_path, 'w') as f:
        json.dump(full_res, f, indent=2, ensure_ascii=False)
186
+
code/finetune-inference/old/completeness_reasoning_v2.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# ===========================
# GPU SETTINGS
# ===========================
# GPU selection must be set before torch / unsloth are imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import torch
from unsloth import FastLanguageModel
import json

# ===========================
# MODEL LOADING (CACHED)
# ===========================
# Module-level singleton so repeated inference calls reuse one model/tokenizer.
_model_cache = {"model": None, "tokenizer": None}
17
+
18
def load_finetuned_model(model_path: str):
    """Load the fine-tuned model + tokenizer once and serve it from the cache."""
    if _model_cache["model"] is None:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=4096,
            load_in_4bit=False,
            load_in_8bit=False,
            full_finetuning=False,
        )
        _model_cache["model"] = model
        _model_cache["tokenizer"] = tokenizer
    return _model_cache["model"], _model_cache["tokenizer"]
32
+
33
+
34
# ===========================
# INFERENCE FUNCTION
# ===========================
def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
    model_path: str = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3",
):
    """
    Given the reference summary, generated summary, readability level, subclaim, and its result (0/1),
    predict reasonableness: reasonable / partially_reasonable / unreasonable, plus justification.
    Returns the parsed JSON dict when possible, otherwise the raw output text.
    """
    model, tokenizer = load_finetuned_model(model_path)

    # ---- Build inference prompt (same structure as training) ----
    prompt = f"""
You are an impartial medical summarization evaluator.

Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.

Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.

Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.

### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = omitted

### Task
Respond **only** with the following JSON object:

{{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short clear explanation>"
}}
""".strip()

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # important for Unsloth chat template
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    # ---- Generate output ----
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            # Greedy decoding; the sampling knobs previously passed alongside
            # do_sample=False (temperature/top_p/top_k) were ignored anyway.
            do_sample=False,
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    # BUG FIX: split(...)[1] raised IndexError when the decoded output had no
    # "</think>" marker (thinking is disabled above); [-1] is safe either way.
    output_text = output_text.split("</think>")[-1].strip().replace("```json", "").replace("```", "")
    # ---- Extract model JSON output ----
    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed
116
+
117
+
118
# ===========================
# EXAMPLE USAGE
# ===========================
if __name__ == "__main__":
    import json
    import tqdm

    # Reference summaries / full texts keyed by example id.
    with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json', 'r') as f:
        multiclinsum_gs_train_es_data = json.load(f)
    ref_summaries = {item['id']: item['summary'] for item in multiclinsum_gs_train_es_data}
    fulltexts = {item['id']: item['fulltext'] for item in multiclinsum_gs_train_es_data}

    # Generated summaries keyed by (id, readability version).
    with open('/home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.json', 'r') as f:
        synthetic_data_es_raw_592 = json.load(f)
    generated_summaries = {
        (item['id'], version): item['readability_versions'][version]['text']
        for item in synthetic_data_es_raw_592
        for version in ['easy', 'intermediate', 'hard']
    }

    with open("/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json", 'r') as f:
        qwen3_32B_results = json.load(f)

    save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/completeness_resonability_check_100_qwen3-32B_v4.json"
    full_res = []
    for idx, item in tqdm.tqdm(enumerate(qwen3_32B_results)):
        print(f"Processing item {idx + 1}/{len(qwen3_32B_results)}")
        reference_summary = ref_summaries[item['id']]
        fulltext = fulltexts[item['id']]  # looked up for parity with siblings; unused below
        generated_summary = generated_summaries[(item['id'], item['version'])]
        temp_res = []
        for item2 in item['completeness']['results']:
            subclaim_text = item2['subclaim']['subclaim']
            result = item2['result']
            if result == "1":
                # Supported subclaims need no reasonableness judgement.
                continue
            response = infer_reasonableness(
                reference_summary,
                generated_summary,
                item['version'],
                subclaim_text,
                result,
                model_path="/home/mshahidul/readctrl_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3",
            )
            temp_res.append({
                'id': item2['subclaim']['id'],
                "subclaim": subclaim_text,
                "result": result,
                "reasonableness": response,
            })
        full_res.append({
            "id": item['id'],
            "version": item['version'],
            "completeness": {"results": temp_res},
        })
        # Checkpoint every 10 processed items.
        if len(full_res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(full_res, f, indent=2, ensure_ascii=False)

    with open(save_path, 'w') as f:
        json.dump(full_res, f, indent=2, ensure_ascii=False)
186
+
code/finetune-inference/old/completeness_reasoning_v3.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Merged BF16 checkpoint fine-tuned for the completeness-reasonableness task.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged"
# Local vLLM server exposing the OpenAI-compatible API; no real key needed.
VLLM_API_URL = "http://localhost:8004/v1"
VLLM_API_KEY = "EMPTY"

# Initialize Client (module-level; shared by infer_reasonableness below)
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)
17
+
18
+ # ===========================
19
+ # INFERENCE FUNCTION
20
+ # ===========================
21
def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """Judge whether including/omitting one subclaim is reasonable.

    Sends a single-turn prompt to the local vLLM server and asks the model
    whether the inclusion (result=1) or omission (result=0) of one
    reference-summary subclaim is acceptable at the given readability level.

    Parameters:
        reference_summary: gold summary the subclaim was extracted from.
        generated_summary: model-generated summary being evaluated.
        readability_level: 'easy' | 'intermediate' | 'hard'.
        subclaim_text: the single subclaim under judgment.
        result: 1 = subclaim supported/included, 0 = omitted.

    Returns:
        dict with keys 'reasonableness' and 'justification' when the model
        reply parses; otherwise the raw reply string so the caller can
        inspect it. Client/connection errors propagate to the caller.
    """

    # ---- Build inference prompt ----
    prompt = f"""
You are an impartial medical summarization evaluator.

Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.

Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.

Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.

### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result}  # 1 = supported (included), 0 = omitted

### Task
Respond **only** with the following JSON object:

{{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short clear explanation>"
}}
""".strip()

    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server (errors intentionally propagate) ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )

    output_text = response.choices[0].message.content

    # ---- Clean Output (Handle Thinking & Markdown) ----
    try:
        # BUG FIX: take everything after the LAST </think>; the original
        # indexed [1], which breaks if the tag appears more than once.
        if "</think>" in output_text:
            output_text = output_text.split("</think>")[-1]

        clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

        # Prefer strict JSON (the prompt asks for JSON, so true/false/null
        # must parse); fall back to Python-literal parsing for replies that
        # use single quotes.
        try:
            return json.loads(clean_text)
        except ValueError:
            return ast.literal_eval(clean_text)
    except (ValueError, SyntaxError):
        # Unparseable reply: hand back the raw text for the caller to handle.
        return output_text
93
+
94
+
95
+ # ===========================
96
+ # MAIN EXECUTION
97
+ # ===========================
98
+ if __name__ == "__main__":
99
+ import argparse
100
+ parser = argparse.ArgumentParser()
101
+ parser.add_argument("--data_path", type=str, required=True,
102
+ help="Path to the JSON file containing evaluation data.")
103
+ args = parser.parse_args()
104
+ data_path = args.data_path
105
+ # data_path = '/home/mshahidul/readctrl/data/concise_complete_attr_cal_v3/evaluated_metrics_0_100.json'
106
+ file_name=os.path.basename(data_path)
107
+
108
+ # Open file directly (Will raise FileNotFoundError if missing)
109
+ with open(data_path, 'r') as f:
110
+ dataset = json.load(f)
111
+
112
+ # print(f"Loaded {len(dataset)} examples. Starting inference...")
113
+ save_path = f'/home/mshahidul/readctrl/data/completeness_resoning_result/{file_name}'
114
+ full_results = []
115
+ if os.path.exists(save_path):
116
+ with open(save_path, 'r') as f:
117
+ full_results = json.load(f)
118
+
119
+ import tqdm
120
+ for item in tqdm.tqdm(dataset):
121
+ if any(d['id'] == item['id'] for d in full_results):
122
+ continue
123
+ reference_summary = item['summary']
124
+ temp2={}
125
+ for label in ['easy', 'intermediate', 'hard']:
126
+ generated_summary = item[f'{label}_text']
127
+ subclaim_list = item['metrics'][f'{label}']['completeness']['details']
128
+ temp=[]
129
+ for idx, subclaim in enumerate(subclaim_list):
130
+
131
+ # Check status (assumes subclaim variable holds the status string)
132
+ result = 1 if subclaim['label'] == 'supported' else 0
133
+
134
+ if result ==0:
135
+ output = infer_reasonableness(
136
+ reference_summary=reference_summary,
137
+ generated_summary=generated_summary,
138
+ readability_level=label,
139
+ subclaim_text=subclaim['subclaim'],
140
+ result=result,
141
+ )
142
+
143
+ temp.append({
144
+ 'subclaim': subclaim['subclaim'],
145
+ 'output': output
146
+ })
147
+ else:
148
+ temp.append({
149
+ 'subclaim': subclaim['subclaim'],
150
+ 'output': {
151
+ 'reasonableness': 'reasonable',
152
+ 'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
153
+ }
154
+ })
155
+
156
+ temp2[label] = {
157
+ 'results': temp
158
+ }
159
+ full_results.append({
160
+ 'id': item['id'],
161
+ 'completeness': temp2
162
+ })
163
+ if len(full_results) % 10 == 0:
164
+ with open(save_path, 'w') as f:
165
+ json.dump(full_results, f, indent=2, ensure_ascii=False)
166
+
167
+ with open(save_path, 'w') as f:
168
+ json.dump(full_results, f, indent=2, ensure_ascii=False)
169
+
170
+
171
+
code/finetune-inference/old/extracting_subclaims.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import tqdm
import argparse
from openai import OpenAI

# -----------------------------
# CONFIGURATION
# -----------------------------
# Merged BF16 checkpoint fine-tuned for subclaim extraction.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims_BF16_merged"
# Local vLLM server (OpenAI-compatible API); the key is unused.
API_URL = "http://localhost:8015/v1"
API_KEY = "EMPTY"

# Module-level client shared by infer_subclaims().
client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # SUBCLAIM EXTRACTION PROMPT
18
+ # -----------------------------
19
def extraction_prompt(medical_text: str) -> str:
    """Build the instruction prompt asking the model to decompose
    *medical_text* into atomic, independently verifiable subclaims.

    NOTE(review): the exact wording is part of the fine-tuned model's
    expected input — do not alter it without re-validating extraction
    quality. This v1 prompt asks for a JSON list; infer_subclaims() parses
    the reply with json.loads.
    """
    return f"""
You are an expert medical annotator. Extract granular, factual subclaims.
A subclaim is the smallest standalone factual unit that can be independently verified.

Rules:
- Use only information explicitly present in the text.
- Do not infer or hallucinate.
- Subclaims must be atomic and factual.
- Return ONLY a JSON list of strings.

Medical Text:
{medical_text}

Return output as:
[
  "subclaim 1",
  "subclaim 2",
  ...
]
"""
+ """
40
+
41
+ # -----------------------------
42
+ # INFERENCE FUNCTION
43
+ # -----------------------------
44
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> list:
    """Extract atomic factual subclaims from *medical_text* via the vLLM server.

    Parameters:
        medical_text: source text; empty/whitespace input short-circuits to [].
        temperature: sampling temperature forwarded to the completion call.

    Returns:
        list: parsed subclaims on success; ``[raw_reply]`` when the model
        reply is not valid JSON; ``[]`` on empty input or API failure.
    """
    # Guard: nothing to extract from empty input (no API call made).
    if not medical_text or medical_text.strip() == "":
        return []

    final_prompt = extraction_prompt(medical_text)

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": final_prompt}],
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
        )
        res = response.choices[0].message.content.strip()
        # Drop any chain-of-thought emitted before the last </think>.
        res = res.split("</think>")[-1].strip()

        # Strip a markdown code fence, as the v2/v3 siblings do.
        if res.startswith("```json"):
            res = res.replace("```json", "").replace("```", "").strip()

        # try parse JSON (json.JSONDecodeError is a ValueError subclass)
        try:
            return json.loads(res)
        except ValueError:
            # BUG FIX: the original returned the bare string here, violating
            # the declared list return type and diverging from the v2/v3
            # scripts; wrap the raw reply in a list instead.
            return [res]

    except Exception as e:
        # Best-effort: log and return an empty extraction on API failure.
        print(f"API error: {e}")
        return []
70
+
71
+ # -----------------------------
72
+ # MAIN
73
+ # -----------------------------
74
+ if __name__ == "__main__":
75
+ parser = argparse.ArgumentParser()
76
+ parser.add_argument("--file1", type=str, required=True,
77
+ help="Path to synthetic_data_es_raw_592.json")
78
+ parser.add_argument("--file2", type=str, required=True,
79
+ help="Path to multiclinsum_gs_train_es.json")
80
+
81
+ parser.add_argument("--start_index", type=int, default=0,
82
+ help="Start index for processing")
83
+ parser.add_argument("--end_index", type=int, default=-1,
84
+ help="End index for processing (exclusive). -1 = until end")
85
+
86
+ args = parser.parse_args()
87
+
88
+ FILE1 = args.file1
89
+ FILE2 = args.file2
90
+
91
+ SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
92
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
93
+
94
+ # Output filename includes the range
95
+ OUTPUT_FILE = os.path.join(
96
+ SAVE_FOLDER,
97
+ f"extracted_subclaims_{args.start_index}_{args.end_index}.json"
98
+ )
99
+
100
+ # -----------------------------
101
+ # Load files
102
+ # -----------------------------
103
+ print("Loading input files...")
104
+ with open(FILE1, "r") as f:
105
+ file1_data = {x["id"]: x for x in json.load(f)}
106
+
107
+ with open(FILE2, "r") as f:
108
+ file2_data = {x["id"]: x for x in json.load(f)}
109
+
110
+ # -----------------------------
111
+ # Merge and slice by range
112
+ # -----------------------------
113
+ all_ids = sorted(list(set(file1_data.keys()) | set(file2_data.keys())))
114
+
115
+ total_items = len(all_ids)
116
+
117
+ start = args.start_index
118
+ end = args.end_index if args.end_index != -1 else total_items
119
+
120
+ slice_ids = all_ids[start:end]
121
+
122
+ print(f"Total IDs: {total_items}")
123
+ print(f"Processing range: {start} → {end} (count={len(slice_ids)})")
124
+
125
+ # -----------------------------
126
+ # Resume mode
127
+ # -----------------------------
128
+ result = []
129
+ if os.path.exists(OUTPUT_FILE):
130
+ try:
131
+ with open(OUTPUT_FILE, "r") as f:
132
+ result = json.load(f)
133
+ except:
134
+ result = []
135
+
136
+ existing_ids = {r["id"] for r in result}
137
+
138
+ # -----------------------------
139
+ # Process items
140
+ # -----------------------------
141
+ for _id in tqdm.tqdm(slice_ids):
142
+
143
+ if _id in existing_ids:
144
+ continue
145
+
146
+ # FILE1 text
147
+ easy_text = inter_text = hard_text = ""
148
+ if _id in file1_data:
149
+ rv = file1_data[_id]["readability_versions"]
150
+ easy_text = rv.get("easy", {}).get("text", "")
151
+ inter_text = rv.get("intermediate", {}).get("text", "")
152
+ hard_text = rv.get("hard", {}).get("text", "")
153
+
154
+ # FILE2 text
155
+ fulltext = summary = ""
156
+ if _id in file2_data:
157
+ fulltext = file2_data[_id].get("fulltext", "")
158
+ summary = file2_data[_id].get("summary", "")
159
+
160
+ # inference
161
+ easy_sub = infer_subclaims(easy_text)
162
+ inter_sub = infer_subclaims(inter_text)
163
+ hard_sub = infer_subclaims(hard_text)
164
+ fulltext_sub = infer_subclaims(fulltext)
165
+ summary_sub = infer_subclaims(summary)
166
+
167
+ # append
168
+ result.append({
169
+ "id": _id,
170
+
171
+ "easy_text": easy_text,
172
+ "easy_subclaims": easy_sub,
173
+
174
+ "intermediate_text": inter_text,
175
+ "intermediate_subclaims": inter_sub,
176
+
177
+ "hard_text": hard_text,
178
+ "hard_subclaims": hard_sub,
179
+
180
+ "fulltext": fulltext,
181
+ "fulltext_subclaims": fulltext_sub,
182
+
183
+ "summary": summary,
184
+ "summary_subclaims": summary_sub
185
+ })
186
+
187
+ # save frequently
188
+ if len(result) % 20 == 0:
189
+ with open(OUTPUT_FILE, "w") as f:
190
+ json.dump(result, f, indent=4, ensure_ascii=False)
191
+
192
+ # final save
193
+ with open(OUTPUT_FILE, "w") as f:
194
+ json.dump(result, f, indent=4, ensure_ascii=False)
195
+
196
+ print(f"Done! Saved to: {OUTPUT_FILE}")
code/finetune-inference/old/extracting_subclaims_v2.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import tqdm
import argparse
from openai import OpenAI

# -----------------------------
# CONFIGURATION
# -----------------------------
# LoRA/adapter checkpoint for subclaim extraction (8k context variant).
MODEL_NAME = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"
# Local vLLM server (OpenAI-compatible API); the key is unused.
API_URL = "http://localhost:8004/v1"
API_KEY = "EMPTY"

# Module-level client shared by infer_subclaims().
client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # SUBCLAIM EXTRACTION PROMPT
18
+ # -----------------------------
19
def extraction_prompt(medical_text: str) -> str:
    """Build the subclaim-extraction prompt for *medical_text*.

    NOTE(review): the exact wording is part of the fine-tuned model's
    expected input — do not alter it without re-validating output quality.
    The model is asked for a Python-style list of strings; infer_subclaims()
    parses the reply with json.loads.
    """
    return f"""
You are an expert medical annotator. Extract granular, factual subclaims.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Do not add, guess, or infer information.
5. Each subclaim should be short, specific, and verifiable.
6. Return ONLY a Python-style list of strings.

Medical Text:
{medical_text}

Return output as:
[
  "subclaim 1",
  "subclaim 2",
  ...
]
"""
42
+
43
+ # -----------------------------
44
+ # INFERENCE FUNCTION
45
+ # -----------------------------
46
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> list:
    """Ask the vLLM-served model to split *medical_text* into atomic subclaims.

    Returns the parsed list on success, a one-element list holding the raw
    reply when it is not valid JSON, and [] for empty input or on any API
    failure.
    """
    # Guard clause: empty/whitespace-only input never reaches the API.
    if not medical_text or medical_text.strip() == "":
        return []

    request_prompt = extraction_prompt(medical_text)

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": request_prompt}],
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
        )
        reply = completion.choices[0].message.content.strip()

        # Discard any chain-of-thought the model emitted before </think>.
        if "</think>" in reply:
            reply = reply.split("</think>")[-1].strip()

        # Peel off a markdown ```json fence if the answer was wrapped in one.
        if reply.startswith("```json"):
            reply = reply.replace("```json", "").replace("```", "").strip()

        try:
            return json.loads(reply)
        except:
            # Not valid JSON — hand the raw text back, wrapped in a list.
            return [reply]

    except Exception as e:
        print(f"API error for text snippet: {e}")
        return []
78
+
79
+ # -----------------------------
80
+ # MAIN
81
+ # -----------------------------
82
+ if __name__ == "__main__":
83
+ parser = argparse.ArgumentParser()
84
+ parser.add_argument("--input_file", type=str,
85
+ default="/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json",
86
+ help="Path to input JSON file")
87
+ parser.add_argument("--start_index", type=int, default=0,
88
+ help="Start index for processing")
89
+ parser.add_argument("--end_index", type=int, default=-1,
90
+ help="End index for processing (exclusive). -1 = until end")
91
+
92
+ args = parser.parse_args()
93
+
94
+ SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
95
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
96
+
97
+ # Output filename based on the source and range
98
+ base_name = os.path.basename(args.input_file).replace(".json", "")
99
+ OUTPUT_FILE = os.path.join(
100
+ SAVE_FOLDER,
101
+ f"subclaims_{base_name}_{args.start_index}_{args.end_index}.json"
102
+ )
103
+
104
+ # -----------------------------
105
+ # Load data
106
+ # -----------------------------
107
+ print(f"Loading {args.input_file}...")
108
+ with open(args.input_file, "r") as f:
109
+ data = json.load(f)
110
+
111
+ total_items = len(data)
112
+ start = args.start_index
113
+ end = args.end_index if args.end_index != -1 else total_items
114
+
115
+ # Slice the data based on arguments
116
+ work_items = data[start:end]
117
+
118
+ print(f"Total records in file: {total_items}")
119
+ print(f"Processing range: {start} → {end} (count={len(work_items)})")
120
+
121
+ # -----------------------------
122
+ # Resume mode
123
+ # -----------------------------
124
+ result = []
125
+ if os.path.exists(OUTPUT_FILE):
126
+ try:
127
+ with open(OUTPUT_FILE, "r") as f:
128
+ result = json.load(f)
129
+ print(f"Resuming from existing file. {len(result)} items already processed.")
130
+ except:
131
+ result = []
132
+
133
+ existing_ids = {r["id"] for r in result}
134
+
135
+ # -----------------------------
136
+ # Process items
137
+ # -----------------------------
138
+ for item in tqdm.tqdm(work_items):
139
+ _id = item.get("id")
140
+
141
+ if _id in existing_ids:
142
+ continue
143
+
144
+ fulltext = item.get("fulltext", "")
145
+ summary = item.get("summary", "")
146
+
147
+ # Run inference for both fields
148
+ fulltext_sub = infer_subclaims(fulltext)
149
+ summary_sub = infer_subclaims(summary)
150
+
151
+ # Build output object
152
+ result.append({
153
+ "id": _id,
154
+ "fulltext": fulltext,
155
+ "fulltext_subclaims": fulltext_sub,
156
+ "summary": summary,
157
+ "summary_subclaims": summary_sub,
158
+ "readability_score": item.get("readability_score", None)
159
+ })
160
+
161
+ # Periodic save to prevent data loss
162
+ if len(result) % 10 == 0:
163
+ with open(OUTPUT_FILE, "w") as f:
164
+ json.dump(result, f, indent=4, ensure_ascii=False)
165
+
166
+ # Final save
167
+ with open(OUTPUT_FILE, "w") as f:
168
+ json.dump(result, f, indent=4, ensure_ascii=False)
169
+
170
+ print(f"Success! Results saved to: {OUTPUT_FILE}")
code/finetune-inference/old/extracting_subclaims_v3.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import tqdm
import argparse
from openai import OpenAI

# -----------------------------
# CONFIGURATION
# -----------------------------
# Same extraction checkpoint and local vLLM endpoint as the v2 script.
MODEL_NAME = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"
API_URL = "http://localhost:8004/v1"
API_KEY = "EMPTY"

# Module-level client shared by infer_subclaims().
client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # SUBCLAIM EXTRACTION PROMPT
18
+ # -----------------------------
19
def extraction_prompt(medical_text: str) -> str:
    """Build the subclaim-extraction prompt for *medical_text*.

    NOTE(review): identical wording to the v2 script; keep the text
    byte-stable — it is the input format the fine-tuned model was trained
    on. infer_subclaims() parses the model reply with json.loads.
    """
    return f"""
You are an expert medical annotator. Extract granular, factual subclaims.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Do not add, guess, or infer information.
5. Each subclaim should be short, specific, and verifiable.
6. Return ONLY a Python-style list of strings.

Medical Text:
{medical_text}

Return output as:
[
  "subclaim 1",
  "subclaim 2",
  ...
]
"""
42
+
43
+ # -----------------------------
44
+ # INFERENCE FUNCTION
45
+ # -----------------------------
46
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> list:
    """Extract atomic factual subclaims from *medical_text* via the local model.

    Successful parses yield a list of subclaim strings; a non-JSON reply is
    returned as [raw_reply]; empty input or an API error yields [].
    """
    if not medical_text or medical_text.strip() == "":
        return []

    try:
        chat_response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": extraction_prompt(medical_text)}],
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
        )
        answer = chat_response.choices[0].message.content.strip()

        # Normalize the reply: drop chain-of-thought, then any ```json fence.
        if "</think>" in answer:
            answer = answer.split("</think>")[-1].strip()
        if answer.startswith("```json"):
            answer = answer.replace("```json", "").replace("```", "").strip()

        try:
            parsed = json.loads(answer)
        except:
            # Fallback: keep the unparsed reply so nothing is lost downstream.
            parsed = [answer]
        return parsed

    except Exception as e:
        print(f"API error for text snippet: {e}")
        return []
78
+
79
+
80
+ # ... (Configuration and extraction_prompt remain the same) ...
81
+
82
+ # -----------------------------
83
+ # MAIN
84
+ # -----------------------------
85
+ if __name__ == "__main__":
86
+ parser = argparse.ArgumentParser()
87
+ parser.add_argument("--input_file", type=str,
88
+ default="/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en.json",
89
+ help="Path to input JSON file")
90
+ parser.add_argument("--start_index", type=int, default=0,
91
+ help="Start index for processing")
92
+ parser.add_argument("--end_index", type=int, default=-1,
93
+ help="End index for processing (exclusive). -1 = until end")
94
+
95
+ args = parser.parse_args()
96
+
97
+ SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
98
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
99
+
100
+ base_name = os.path.basename(args.input_file).replace(".json", "")
101
+ OUTPUT_FILE = os.path.join(
102
+ SAVE_FOLDER,
103
+ f"subclaims_with_generated_{base_name}_{args.start_index}_{args.end_index}.json"
104
+ )
105
+
106
+ print(f"Loading {args.input_file}...")
107
+ with open(args.input_file, "r") as f:
108
+ data = json.load(f)
109
+
110
+ total_items = len(data)
111
+ start = args.start_index
112
+ end = args.end_index if args.end_index != -1 else total_items
113
+ work_items = data[start:end]
114
+
115
+ result = []
116
+ if os.path.exists(OUTPUT_FILE):
117
+ try:
118
+ with open(OUTPUT_FILE, "r") as f:
119
+ result = json.load(f)
120
+ print(f"Resuming. {len(result)} items already processed.")
121
+ except:
122
+ result = []
123
+
124
+ # Using "index" or "id" as the unique identifier based on your JSON snippet
125
+ existing_ids = {r.get("index") or r.get("id") for r in result}
126
+
127
+ for item in tqdm.tqdm(work_items):
128
+ # Handle different ID key names
129
+ curr_id = item.get("index") if item.get("index") is not None else item.get("id")
130
+
131
+ if curr_id in existing_ids:
132
+ continue
133
+
134
+ # 1. Process standard fields
135
+ fulltext = item.get("fulltext", "")
136
+ summary = item.get("summary", "")
137
+
138
+ fulltext_sub = infer_subclaims(fulltext)
139
+ summary_sub = infer_subclaims(summary)
140
+
141
+ # 2. Process all generated texts (diff_label_texts)
142
+ # We will create a mirror dictionary to store the subclaims
143
+ diff_label_subclaims = {}
144
+ generated_texts = item.get("diff_label_texts", {})
145
+
146
+ for label, text in generated_texts.items():
147
+ if text:
148
+ diff_label_subclaims[label] = infer_subclaims(text)
149
+ else:
150
+ diff_label_subclaims[label] = []
151
+
152
+ # 3. Build output object
153
+ output_item = {
154
+ "index": curr_id,
155
+ "fulltext": fulltext,
156
+ "fulltext_subclaims": fulltext_sub,
157
+ "summary": summary,
158
+ "summary_subclaims": summary_sub,
159
+ "diff_label_texts": generated_texts,
160
+ "diff_label_subclaims": diff_label_subclaims, # New field
161
+ "readability_score": item.get("readability_score", None)
162
+ }
163
+
164
+ result.append(output_item)
165
+
166
+ # Periodic save
167
+ if len(result) % 10 == 0:
168
+ with open(OUTPUT_FILE, "w") as f:
169
+ json.dump(result, f, indent=4, ensure_ascii=False)
170
+
171
+ # Final save
172
+ with open(OUTPUT_FILE, "w") as f:
173
+ json.dump(result, f, indent=4, ensure_ascii=False)
174
+
175
+ print(f"Success! Results saved to: {OUTPUT_FILE}")
code/finetune-inference/old/inference.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Flat script: generate readability-band (B1/B2/B3) synthetic summaries for a
# Spanish dataset with a fine-tuned Qwen3-14B model loaded via Unsloth, with
# resume support keyed on (article, gold_summary).
import argparse
import os
import json
import sys
sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_
# 1. Argparse for path
parser = argparse.ArgumentParser(description="Translation Evaluation")
parser.add_argument("--path", type=str, default="/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json", help="Path to the JSON file")
parser.add_argument("--cuda", type=str, default="3", help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs")
args = parser.parse_args()

# Pin GPUs before any CUDA library is imported; fall back to auto-selection.
# NOTE(review): with default="3" the else-branch is unreachable via the CLI.
if args.cuda is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
else:
    _gpu_selection_()

# 2. Output directory and file
out_dir = "/home/mshahidul/readctrl/results/"
os.makedirs(os.path.dirname(out_dir), exist_ok=True)
file_name = os.path.basename(args.path)
out_path = os.path.join(out_dir, file_name)

# 3. Load already evaluated results if exist (resume keyed per item, not per band)
results = []
completed_keys = set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    for r in results:
        completed_keys.add((r["article"], r["gold_summary"]))

# 4. Load dataset
with open(args.path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
# Unsloth is imported only after CUDA_VISIBLE_DEVICES is set above.
from unsloth import FastLanguageModel
import torch
# 5. Load model (4-bit quantized adapter checkpoint)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v1",
    max_seq_length = 4092,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)
from prompt_generate import generate_prompt
# 6. Evaluation loop
import tqdm
for item in tqdm.tqdm(dataset):
    key = (item["article"], item["gold_summary"])
    if key in completed_keys:
        continue

    for band in ["B1", "B2", "B3"]:
        prompt = generate_prompt(item['article'],item['gold_summary'],band,"es")

        messages = [{"role": "user", "content": prompt+"\n"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            temperature=0.1,
            top_p=0.8,
            top_k=5,
        )
        # NOTE(review): decoding output_ids[0] without slicing off the input
        # tokens means the prompt text is included in output_text — confirm
        # downstream consumers expect that.
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        #answer = output_text.split("</think>")[1].strip()

        results.append({
            "article": item["article"],
            "gold_summary": item["gold_summary"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })
    completed_keys.add(key)
    # Save every 30 results
    if len(results) % 30 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# 7. Final save
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
code/finetune-inference/old/inferenceV2_without_context.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Flat script: rewrite the first 50 Spanish test records at three readability
# levels (easy/intermediate/hard) WITHOUT the gold summary as context, using a
# system-prompt-per-band setup and length-adaptive max_new_tokens.
import argparse
import os
import json
import sys
sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_
# 1. Argparse for path
parser = argparse.ArgumentParser(description="Translation Evaluation")
# parser.add_argument("--out_path", type=str, default="/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json", help="Path to the JSON file")
parser.add_argument("--cuda", type=str, default="3", help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs")
parser.add_argument("--model_name", type=str, default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2", help="Path to the finetuned model")
parser.add_argument("--temperature", type=float, default=0.1, help="Generation temperature")
args = parser.parse_args()
# out_path = args.out_path
model_name = args.model_name
temperature = args.temperature
# Pin GPUs before CUDA libraries load; with default="3" the else-branch is
# only reachable if --cuda is explicitly passed as empty — review note.
if args.cuda is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
else:
    _gpu_selection_()

# Per-band system prompts targeting Fernández Huerta readability ranges.
prompts={
    "easy":'''
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
''',

    'intermediate':'''
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
''',

    'hard':'''
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
'''
}

# 2. Output directory and file
path="/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v2_without_context"
os.makedirs(out_dir, exist_ok=True)
# file_name = os.path.basename(path)
# out_path = os.path.join(out_dir, file_name.replace(".json", "_V2.json"))
# os.makedirs(os.path.dirname(out_dir), exist_ok=True)
# Output name encodes temperature and whether a local finetuned checkpoint
# exists at model_name (otherwise the base model is assumed).
if os.path.exists(model_name):
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_finetuned.json"
else:
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_base.json"
# 3. Load already evaluated results if exist (resume keyed on fulltext)
results = []
completed_keys = set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    for r in results:
        completed_keys.add(r["fulltext"])

# 4. Load dataset (only the first 50 records are processed)
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset=dataset[0:50]
# Unsloth is imported after CUDA_VISIBLE_DEVICES is set above.
from unsloth import FastLanguageModel
import torch
# 5. Load model (full precision; no quantization flags set)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 4092,
    load_in_4bit = False,
    load_in_8bit = False,
    full_finetuning = False,
)

import tqdm
for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue

    for band in ["easy", "intermediate", "hard"]:
        # NOTE(review): `prompt` is built but unused — the chat `messages`
        # below are what is actually sent to the model.
        prompt = prompts[band]+'\n\n'+"Input text:\n"+item['fulltext']

        # messages = [{"role": "user", "content": prompt+"\n"}]
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": "Input text:\n" + item["fulltext"].strip()}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        # input_ids = tokenizer(item["fulltext"], return_tensors="pt").input_ids
        # input_len = input_ids.shape[1]
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Define proportional multipliers for each readability level
        length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

        # Compute adaptive max_new_tokens, clamped to [150, 1200]
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # NOTE(review): decodes the full sequence, so the prompt text is
        # included in output_text — confirm downstream consumers expect that.
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        #answer = output_text.split("</think>")[1].strip()

        results.append({
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })
    completed_keys.add(key)
    # Save every 10 results -- NOTE(review): the check is actually % 3,
    # i.e. roughly after every item (3 bands each); comment and code disagree.
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# 7. Final save
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
code/finetune-inference/old/inferenceV3.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
import json
import sys

sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_

parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument("--cuda", type=str, default="3")
parser.add_argument("--model_name", type=str, default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2")
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

model_name = args.model_name
temperature = args.temperature

# GPU must be pinned before torch/unsloth initialize CUDA.
if args.cuda is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
else:
    _gpu_selection_()

# System prompts per readability band, targeting Fernández Huerta index ranges.
prompts = {
    "easy": '''
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
''',
    "intermediate": '''
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
''',
    "hard": '''
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
''',
}

# -------- Load keyword–definition dataset ----------
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)

# Build quick lookup: id -> glossary text (empty string when no keywords).
def_map = {}
for obj in definitions_data:
    cid = obj.get("id")
    kwlist = obj.get("medical_keywords", [])
    defs_str = ""
    if kwlist:
        defs_lines = [f"• {d['term']} — {d['definition']}" for d in kwlist]
        defs_str = "Relevant medical definitions:\n" + "\n".join(defs_lines)
    def_map[cid] = defs_str
# --------------------------------------------------------------

path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v3_context"
os.makedirs(out_dir, exist_ok=True)

# A local directory as --model_name means we are running the finetuned model.
if os.path.exists(model_name):
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
else:
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_base_with_defs.json"

# Resume support: reload previous results and skip already-processed items.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    for r in results:
        completed_keys.add(r["fulltext"])

# -------- Load main dataset -----------
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset = dataset[0:50]

from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4092,  # NOTE(review): likely intended 4096 — confirm before changing
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)

import tqdm

# Proportional output-length multipliers per band (hoisted: loop-invariant).
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue
    item_id = item["id"]
    glossary = def_map.get(item_id, "")  # retrieve glossary if it exists

    for band in ["easy", "intermediate", "hard"]:
        # Append definitions below the case text when available.
        user_content = f"Input text:\n{item['fulltext'].strip()}"
        if glossary:
            user_content += "\n\n" + glossary

        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Adaptive budget: clamp input_len * factor into [150, 1200].
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # BUGFIX: decode only the newly generated tokens. The original decoded
        # output_ids[0] in full, so the chat prompt leaked into every summary.
        output_text = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True)

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary),  # track whether glossary applied
        })

    completed_keys.add(key)
    # Periodic checkpoint (3 results == one fully processed item).
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# Final save.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)


from notifier import send_notification
send_notification(
    "process-complete1507034",
    f"Finished inference with model {model_name} at temperature {temperature}. Results saved to {out_path}",
    title="Inference Complete",
    priority="default",
    tags="tada"
)
code/finetune-inference/old/inferenceV3_temp.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
import json
import sys


parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument("--model_name", type=str, default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2")
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

model_name = args.model_name
temperature = args.temperature

# System prompts per readability band, targeting Fernández Huerta index ranges.
prompts = {
    "easy": '''
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
''',
    "intermediate": '''
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
''',
    "hard": '''
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
''',
}

# -------- Load keyword–definition dataset ----------
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)

# Build quick lookup: id -> glossary text (empty string when no keywords).
def_map = {}
for obj in definitions_data:
    cid = obj.get("id")
    kwlist = obj.get("medical_keywords", [])
    defs_str = ""
    if kwlist:
        defs_lines = [f"• {d['term']} — {d['definition']}" for d in kwlist]
        defs_str = "Relevant medical definitions:\n" + "\n".join(defs_lines)
    def_map[cid] = defs_str
# --------------------------------------------------------------

path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v3"
os.makedirs(out_dir, exist_ok=True)

# A local directory as --model_name means we are running the finetuned model.
if os.path.exists(model_name):
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
else:
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_base_with_defs.json"

# Resume support: reload previous results and skip already-processed items.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    for r in results:
        completed_keys.add(r["fulltext"])

# -------- Load main dataset -----------
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset = dataset[0:50]

from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4092,  # NOTE(review): likely intended 4096 — confirm before changing
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)

import tqdm

# Proportional output-length multipliers per band (hoisted: loop-invariant).
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue
    item_id = item["id"]
    glossary = def_map.get(item_id, "")  # retrieve glossary if it exists

    for band in ["easy", "intermediate", "hard"]:
        # Append definitions below the case text when available.
        user_content = f"Input text:\n{item['fulltext'].strip()}"
        if glossary:
            user_content += "\n\n" + glossary

        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Adaptive budget: clamp input_len * factor into [150, 1200].
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # BUGFIX: decode only the newly generated tokens. The original decoded
        # output_ids[0] in full, so the chat prompt leaked into every summary.
        output_text = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True)

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary),  # track whether glossary applied
        })

    completed_keys.add(key)
    # Periodic checkpoint (3 results == one fully processed item).
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# Final save.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
code/finetune-inference/old/inferenceV4.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
import json
import sys

sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_

parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument("--cuda", type=str, default="3")
parser.add_argument("--model_name", type=str, default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2")
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

model_name = args.model_name
temperature = args.temperature

# GPU must be pinned before torch/unsloth initialize CUDA.
if args.cuda is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
else:
    _gpu_selection_()

# Spanish-language system prompts per readability band (FH = Fernández Huerta).
prompts = {
    "easy": '''
Reescribe el siguiente informe médico en español con un nivel de lectura fácil correspondiente a un puntaje FH entre 70 y 100 (texto muy comprensible).
Usa oraciones cortas y directas, vocabulario cotidiano, estructuras simples y explicaciones claras de términos médicos. El tono debe ser empático y accesible, como si estuvieras explicando la situación a un paciente o familiar sin conocimientos médicos.
Mantén los datos clínicos y resultados esenciales, pero reemplaza o aclara tecnicismos con frases simples. Evita abreviaturas o siglas sin explicación.
''',
    "intermediate": '''
Reformula el siguiente informe médico en español con un nivel de lectura intermedio, correspondiente a un puntaje FH entre 50 y 70 (texto de dificultad moderada).
Usa lenguaje formal pero comprensible, adecuado para lectores con educación general o estudiantes del área de salud. Mantén la precisión médica, pero agrega explicaciones breves tras los términos técnicos. Alterna oraciones simples y compuestas, con buena fluidez y cohesión.
El texto debe sonar profesional, informativo y claro, sin llegar a la densidad típica de lenguaje técnico especializado.
''',
    "hard": '''
Reescribe el siguiente informe médico en español con un nivel de lectura avanzado o técnico, correspondiente a un puntaje FH entre 0 y 50 (texto especializado).
Usa terminología médica precisa, estructuras sintácticas complejas y tono formal típico de documentos clínicos o publicaciones científicas. No simplifiques ni expliques los tecnicismos; conserva la exactitud conceptual y la nomenclatura profesional.
Refleja el razonamiento clínico, hallazgos y juicios médicos con lenguaje apropiado para médicos, especialistas o investigadores.
''',
}

# -------- Load keyword–definition dataset ----------
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)

# Build quick lookup: id -> glossary text (empty string when no keywords).
def_map = {}
for obj in definitions_data:
    cid = obj.get("id")
    kwlist = obj.get("medical_keywords", [])
    defs_str = ""
    if kwlist:
        defs_lines = [f"• {d['term']} — {d['definition']}" for d in kwlist]
        defs_str = "Relevant medical definitions:\n" + "\n".join(defs_lines)
    def_map[cid] = defs_str
# --------------------------------------------------------------

path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/custom_promptsV1"
os.makedirs(out_dir, exist_ok=True)

# A local directory as --model_name means we are running the finetuned model.
if os.path.exists(model_name):
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
else:
    out_path = out_dir + f"/temp{temperature}_qwen3-14B_base_with_defs.json"

# Resume support: reload previous results and skip already-processed items.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    for r in results:
        completed_keys.add(r["fulltext"])

# -------- Load main dataset -----------
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset = dataset[0:50]

from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4092,  # NOTE(review): likely intended 4096 — confirm before changing
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)

import tqdm

# Proportional output-length multipliers per band (hoisted: loop-invariant).
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue
    item_id = item["id"]
    glossary = def_map.get(item_id, "")  # retrieve glossary if it exists

    for band in ["easy", "intermediate", "hard"]:
        user_content = f"Input text:\n{item['fulltext'].strip()}"
        # Glossary injection deliberately disabled in this variant; note that
        # "definitions_used" below still records glossary *availability*.
        # if glossary:
        #     user_content += "\n\n" + glossary

        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Adaptive budget: clamp input_len * factor into [150, 1200].
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # BUGFIX: decode only the newly generated tokens. The original decoded
        # output_ids[0] in full, so the chat prompt leaked into every summary.
        output_text = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True)

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary),  # glossary availability, not usage
        })

    completed_keys.add(key)
    # Periodic checkpoint (3 results == one fully processed item).
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# Final save.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)


from notifier import send_notification
send_notification(
    "process-complete1507034",
    f"Finished inference with model {model_name} at temperature {temperature}. Results saved to {out_path}",
    title="Inference Complete",
    priority="default",
    tags="tada"
)
code/finetune-inference/old/inference_extract_subclaims.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
# Pin GPU selection before torch initializes CUDA (must precede the imports below).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
from unsloth import FastLanguageModel
import json
import tqdm

# -----------------------------
# MODEL CACHE
# -----------------------------
# Module-level cache so the expensive model load happens once per process.
_model_cache = {"model": None, "tokenizer": None}
14
+
15
def load_finetuned_model(model_path: str):
    """Load and cache the fine-tuned subclaim extraction model + tokenizer.

    BUGFIX: the cache is now keyed by ``model_path``. The original returned
    whatever model was cached first, even when called with a different path.

    Args:
        model_path: Local path (or hub id) of the fine-tuned model.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if _model_cache["model"] is not None and _model_cache.get("path") == model_path:
        return _model_cache["model"], _model_cache["tokenizer"]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"], _model_cache["tokenizer"] = model, tokenizer
    _model_cache["path"] = model_path  # remember which model is cached
    return model, tokenizer
29
+
30
+
31
+ # -----------------------------
32
+ # SUBCLAIM EXTRACTION PROMPT
33
+ # -----------------------------
34
def extraction_prompt(medical_text: str) -> str:
    """Format the subclaim-extraction instruction prompt around *medical_text*.

    The returned prompt asks the annotator model for a JSON list of atomic,
    verifiable factual subclaims taken verbatim from the input text.
    """
    return f"""
    You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
    A subclaim is the smallest standalone factual unit that can be independently verified.

    Instructions:
    1. Read the provided medical text.
    2. Break it into clear, objective, atomic subclaims.
    3. Each subclaim must come directly from the text.
    4. Do not add, guess, or infer information.
    5. Each subclaim should be short, specific, and verifiable.
    6. Return ONLY a Python-style list of strings.

    Medical Text:
    {medical_text}

    Return your output in JSON list format, like:
    [
        "subclaim 1",
        "subclaim 2",
        ...
    ]
    """
58
+
59
+
60
+ # -----------------------------
61
+ # INFERENCE FUNCTION
62
+ # -----------------------------
63
def infer_subclaims(medical_text: str,
                    model_path: str,
                    temperature: float = 0.2) -> str:
    """Extract subclaims from *medical_text* with the fine-tuned model.

    Fixes vs. original:
    - decodes only the newly generated tokens; previously the whole sequence
      (chat prompt included) was decoded, so the prompt leaked into the output.
    - drops temperature/top_p/top_k: with ``do_sample=False`` HF generate
      ignores them and only emits warnings. *temperature* is kept in the
      signature for caller compatibility but has no effect under greedy decode.

    Returns:
        The model's raw text output (expected to be a JSON-style list).
    """
    model, tokenizer = load_finetuned_model(model_path)

    prompt = extraction_prompt(medical_text)
    messages = [{"role": "user", "content": prompt}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")
    input_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,  # greedy decoding for reproducible extraction
        )

    # Decode only the completion, not the echoed prompt.
    output_text = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True).strip()

    # Strip any <think>...</think> reasoning block the model may still emit.
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip()

    return output_text
99
+
100
+
101
+ # -----------------------------
102
+ # MAIN EXECUTION
103
+ # -----------------------------
104
+ if __name__ == "__main__":
105
+ import argparse
106
+ parser = argparse.ArgumentParser()
107
+ parser.add_argument("--input_file", type=str, required=True,
108
+ help="Path to the input JSON file containing medical texts.")
109
+ args = parser.parse_args()
110
+ INPUT_FILE = args.input_file
111
+ file_name=os.path.basename(INPUT_FILE).split(".json")[0]
112
+ SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
113
+ MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"
114
+
115
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
116
+
117
+ OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}_en.json")
118
+
119
+ # Load input dataset
120
+ with open(INPUT_FILE, "r") as f:
121
+ data = json.load(f)
122
+
123
+ # Load existing results (resume mode)
124
+ result = []
125
+ if os.path.exists(OUTPUT_FILE):
126
+ with open(OUTPUT_FILE, "r") as f:
127
+ result = json.load(f)
128
+
129
+ existing_ids = {item["id"] for item in result}
130
+
131
+ # --------------------------------------------------------
132
+ # PROCESS EACH MEDICAL TEXT
133
+ # --------------------------------------------------------
134
+ for item in tqdm.tqdm(data):
135
+ if item["id"] in existing_ids:
136
+ continue
137
+
138
+ medical_text = item.get("fulltext", "")
139
+
140
+ extracted = infer_subclaims(
141
+ medical_text,
142
+ model_path=MODEL_PATH
143
+ )
144
+
145
+ result.append({
146
+ "id": item["id"],
147
+ "medical_text": medical_text,
148
+ "subclaims": extracted,
149
+ "summary": item.get("summary", "")
150
+ })
151
+
152
+ # Save every 20 entries
153
+ if len(result) % 20 == 0:
154
+ print(f"Saving intermediate results... Total processed: {len(result)}")
155
+ with open(OUTPUT_FILE, "w") as f:
156
+ json.dump(result, f, indent=4, ensure_ascii=False)
157
+
158
+ # Final save
159
+ with open(OUTPUT_FILE, "w") as f:
160
+ json.dump(result, f, indent=4, ensure_ascii=False)
161
+
162
+ print("Extraction completed.")
code/finetune-inference/old/inference_extract_subclaims_v2.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
# Set GPU environment variables before torch initializes CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from unsloth import FastLanguageModel
import json
import tqdm
import argparse



# -----------------------------
# MODEL CACHE
# -----------------------------
# Module-level cache so the expensive model load happens once per process.
_model_cache = {"model": None, "tokenizer": None}
17
+
18
def load_finetuned_model(model_path: str):
    """Load and cache the fine-tuned subclaim extraction model + tokenizer.

    BUGFIX: the cache is now keyed by ``model_path``. The original returned
    whatever model was cached first, even when called with a different path.

    Args:
        model_path: Local path (or hub id) of the fine-tuned model.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if _model_cache["model"] is not None and _model_cache.get("path") == model_path:
        return _model_cache["model"], _model_cache["tokenizer"]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"], _model_cache["tokenizer"] = model, tokenizer
    _model_cache["path"] = model_path  # remember which model is cached
    return model, tokenizer
32
+
33
+
34
+ # -----------------------------
35
+ # SUBCLAIM EXTRACTION PROMPT
36
+ # -----------------------------
37
def extraction_prompt(medical_text: str) -> str:
    """Format the subclaim-extraction instruction prompt around *medical_text*.

    The returned prompt asks the annotator model for a JSON list of atomic,
    verifiable factual subclaims taken verbatim from the input text.
    """
    return f"""
    You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
    A subclaim is the smallest standalone factual unit that can be independently verified.

    Instructions:
    1. Read the provided medical text.
    2. Break it into clear, objective, atomic subclaims.
    3. Each subclaim must come directly from the text.
    4. Do not add, guess, or infer information.
    5. Each subclaim should be short, specific, and verifiable.
    6. Return ONLY a Python-style list of strings.

    Medical Text:
    {medical_text}

    Return your output in JSON list format, like:
    [
        "subclaim 1",
        "subclaim 2",
        ...
    ]
    """
61
+
62
+
63
+ # -----------------------------
64
+ # INFERENCE FUNCTION
65
+ # -----------------------------
66
def infer_subclaims(medical_text: str,
                    model,
                    tokenizer,
                    temperature: float = 0.2) -> list:
    """Extract subclaims from *medical_text* using an already-loaded model.

    Fixes vs. original:
    - decodes only the newly generated tokens; previously the whole sequence
      (chat prompt included) was decoded, so the prompt leaked into the output.
    - the JSON-span guard checked ``end_idx != -1``, which was always true
      because ``end_idx = rfind(']') + 1`` can never be -1; it now requires a
      real closing bracket after the opening one.
    - drops temperature/top_p/top_k: with ``do_sample=False`` HF generate
      ignores them and only emits warnings. *temperature* is kept in the
      signature for caller compatibility but has no effect under greedy decode.

    Returns:
        A list of subclaim strings when the output parses as JSON, the raw
        output string otherwise, or ``[]`` for empty input.
    """
    if not medical_text or medical_text.strip() == "":
        return []

    prompt = extraction_prompt(medical_text)
    messages = [{"role": "user", "content": prompt}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")
    input_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,  # enough headroom for long subclaim lists
            do_sample=False,      # greedy decoding for reproducible extraction
        )

    # Decode only the completion, not the echoed prompt.
    output_text = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True).strip()

    # Strip any <think>...</think> reasoning block the model may still emit.
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip()

    # Best-effort JSON parse: locate the outermost [...] span in case the
    # model wrapped the list in conversational filler; fall back to raw text.
    try:
        start_idx = output_text.find('[')
        end_idx = output_text.rfind(']')
        if start_idx != -1 and end_idx > start_idx:
            return json.loads(output_text[start_idx:end_idx + 1])
        return output_text
    except Exception:
        return output_text
112
+
113
+
114
+ # -----------------------------
115
+ # MAIN EXECUTION
116
+ # -----------------------------
117
+ if __name__ == "__main__":
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument("--input_file", type=str, required=True,
120
+ help="Path to the input JSON file containing medical texts.")
121
+ args = parser.parse_args()
122
+
123
+ INPUT_FILE = args.input_file
124
+ file_name = os.path.basename(INPUT_FILE).split(".json")[0]
125
+ SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
126
+ MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"
127
+
128
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
129
+ OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}_en.json")
130
+
131
+ # Load Model once
132
+ model, tokenizer = load_finetuned_model(MODEL_PATH)
133
+
134
+ # Load input dataset
135
+ with open(INPUT_FILE, "r") as f:
136
+ data = json.load(f)
137
+
138
+ # Load existing results (resume mode)
139
+ result = []
140
+ if os.path.exists(OUTPUT_FILE):
141
+ with open(OUTPUT_FILE, "r") as f:
142
+ result = json.load(f)
143
+
144
+ existing_ids = {item["id"] for item in result}
145
+
146
+ # --------------------------------------------------------
147
+ # PROCESS EACH MEDICAL TEXT (Fulltext AND Summary)
148
+ # --------------------------------------------------------
149
+ for item in tqdm.tqdm(data):
150
+ if item.get("id") in existing_ids:
151
+ continue
152
+
153
+ # Extract from Fulltext
154
+ fulltext_content = item.get("fulltext", "")
155
+ fulltext_subclaims = infer_subclaims(fulltext_content, model, tokenizer)
156
+
157
+ # Extract from Summary
158
+ summary_content = item.get("summary", "")
159
+ summary_subclaims = infer_subclaims(summary_content, model, tokenizer)
160
+
161
+ result.append({
162
+ "id": item.get("id"),
163
+ "fulltext": fulltext_content,
164
+ "fulltext_subclaims": fulltext_subclaims,
165
+ "summary": summary_content,
166
+ "summary_subclaims": summary_subclaims,
167
+ "readability_score": item.get("readability_score", None)
168
+ })
169
+
170
+ # Save intermediate results
171
+ if len(result) % 20 == 0:
172
+ with open(OUTPUT_FILE, "w") as f:
173
+ json.dump(result, f, indent=4, ensure_ascii=False)
174
+
175
+ # Final save
176
+ with open(OUTPUT_FILE, "w") as f:
177
+ json.dump(result, f, indent=4, ensure_ascii=False)
178
+
179
+ print(f"Extraction completed. Saved to {OUTPUT_FILE}")
code/finetune-inference/old/inference_extract_subclaims_v3.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # Set GPU environment variables
3
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
5
+ import torch
6
+ from unsloth import FastLanguageModel
7
+ import json
8
+ import tqdm
9
+ import argparse
10
+
11
+ # -----------------------------
12
+ # MODEL CACHE
13
+ # -----------------------------
14
_model_cache = {"model": None, "tokenizer": None}

def load_finetuned_model(model_path: str):
    """Load the fine-tuned Unsloth model once and reuse it on later calls.

    Returns the cached ``(model, tokenizer)`` pair when one exists; otherwise
    loads from ``model_path``, memoizes the pair in ``_model_cache`` and
    returns it.
    """
    cached_model = _model_cache["model"]
    if cached_model is not None:
        return cached_model, _model_cache["tokenizer"]

    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"] = loaded_model
    _model_cache["tokenizer"] = loaded_tokenizer
    return loaded_model, loaded_tokenizer
29
+
30
+ # -----------------------------
31
+ # SUBCLAIM EXTRACTION PROMPT
32
+ # -----------------------------
33
def extraction_prompt(medical_text: str) -> str:
    """Build the instruction prompt asking the model to split ``medical_text``
    into atomic, independently verifiable subclaims returned as a JSON list.
    """
    # NOTE: the literal JSON example below is part of the prompt text; the
    # caller parses the model's reply by locating the first '[' span.
    prompt = f"""
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Return ONLY a valid JSON list of strings.

Medical Text:
{medical_text}

Return your output in JSON list format:
[
  "subclaim 1",
  "subclaim 2"
]
"""
    return prompt
54
+
55
+ # -----------------------------
56
+ # INFERENCE FUNCTION WITH REPAIR
57
+ # -----------------------------
58
def infer_subclaims(medical_text: str, model, tokenizer, temperature: float = 0.2, max_tokens: int = 2048) -> list:
    """Run the fine-tuned model on ``medical_text`` and return a list of subclaims.

    Parameters
    ----------
    medical_text : str
        Source text; empty or whitespace-only input short-circuits to ``[]``.
    model, tokenizer :
        Pair returned by ``load_finetuned_model()``.
    temperature : float
        Kept for interface compatibility. Decoding is greedy
        (``do_sample=False``), so it is intentionally NOT forwarded to
        ``generate()`` — passing it while not sampling is ignored and
        triggers a transformers warning.
    max_tokens : int
        Upper bound on newly generated tokens.

    Returns
    -------
    list
        Parsed JSON list of subclaim strings, or ``[raw_output]`` when the
        model's reply cannot be parsed as a JSON list (so callers always
        receive a list).
    """
    if not medical_text or medical_text.strip() == "":
        return []

    prompt = extraction_prompt(medical_text)
    messages = [{"role": "user", "content": prompt}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,  # greedy decoding; temperature deliberately omitted
        )

    # BUGFIX: decode only the newly generated tokens. The previous code
    # decoded the full sequence (prompt + completion); the prompt's JSON
    # example contains '[', so the bracket-matching parser below could
    # capture prompt text instead of the model's answer.
    prompt_len = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(
        output_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # Strip any chain-of-thought block emitted before the final answer.
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip()

    # Parse the first '[' ... last ']' span as JSON; on any failure fall back
    # to wrapping the raw text in a list.
    try:
        start_idx = output_text.find('[')
        # BUGFIX: rfind() signals a miss with -1; the old code added 1 first
        # and then compared against -1, so the miss case was never detected.
        end_idx = output_text.rfind(']')
        if start_idx != -1 and end_idx != -1:
            parsed = json.loads(output_text[start_idx:end_idx + 1])
            if isinstance(parsed, list):
                return parsed
        return [output_text]
    except Exception:
        return [output_text]
98
+
99
+ # -----------------------------
100
+ # MAIN EXECUTION
101
+ # -----------------------------
102
if __name__ == "__main__":
    # CLI: only the input JSON path is configurable; output folder and model
    # path are fixed deployment locations.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]
    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"

    os.makedirs(SAVE_FOLDER, exist_ok=True)
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}_en.json")

    model, tokenizer = load_finetuned_model(MODEL_PATH)

    # Load input dataset
    with open(INPUT_FILE, "r") as f:
        data = json.load(f)

    # Load existing results (resume / repair mode)
    result = []
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            result = json.load(f)

    # Keyed by item id so previously processed entries can be repaired in
    # place rather than only skipped.
    processed_data = {item["id"]: item for item in result}

    for item in tqdm.tqdm(data):
        item_id = item.get("id")
        existing_entry = processed_data.get(item_id)

        # Re-run extraction when the item is new OR a previous pass produced
        # no usable list (e.g. the model returned raw text that failed JSON
        # parsing); otherwise keep the existing subclaims.

        # 1. Check Fulltext Subclaims
        fulltext_needs_update = (
            not existing_entry or
            not isinstance(existing_entry.get("fulltext_subclaims"), list) or
            len(existing_entry.get("fulltext_subclaims", [])) == 0
        )

        if fulltext_needs_update:
            # Larger token budget: full texts run longer than summaries.
            f_sub = infer_subclaims(item.get("fulltext", ""), model, tokenizer, max_tokens=3072)
        else:
            f_sub = existing_entry["fulltext_subclaims"]

        # 2. Check Summary Subclaims
        summary_needs_update = (
            not existing_entry or
            not isinstance(existing_entry.get("summary_subclaims"), list) or
            len(existing_entry.get("summary_subclaims", [])) == 0
        )

        if summary_needs_update:
            s_sub = infer_subclaims(item.get("summary", ""), model, tokenizer, max_tokens=2048)
        else:
            s_sub = existing_entry["summary_subclaims"]

        # Update or append the record for this id.
        new_entry = {
            "id": item_id,
            "fulltext": item.get("fulltext", ""),
            "fulltext_subclaims": f_sub,
            "summary": item.get("summary", ""),
            "summary_subclaims": s_sub,
            "readability_score": item.get("readability_score", None)
        }
        processed_data[item_id] = new_entry

        # Intermediate checkpoint every 20 known records.
        # NOTE(review): len(processed_data) does not grow while repairing
        # already-known ids, so checkpoints can repeat or be skipped on
        # repair runs — harmless because the final save below always runs.
        if len(processed_data) % 20 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    print(f"Refinement completed. Total records: {len(processed_data)}")
code/finetune-inference/old/nemotran_inference.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
5
+
6
+ import os
7
+ import json
8
+ import tqdm
9
+ import argparse
10
+ import torch
11
+ from unsloth import FastLanguageModel
12
+
13
+ # -----------------------------
14
+ # UNSLOTH MODEL CONFIGURATION
15
+ # -----------------------------
16
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/nemotron-3-nano-30b-a3b_subclaims-support-check-8b_ctx_v2-bf16"
max_seq_length = 2048  # context budget for medical text + reasoning
dtype = None  # let Unsloth auto-detect (bfloat16 on Ampere+ GPUs)
load_in_4bit = True  # 4-bit quantization so the 30B model fits on a single GPU

# Load model and tokenizer once at import time — this module is meant to be
# run as a script, so the top-level side effect is intentional.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_PATH,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    trust_remote_code = True,
)

# Switch Unsloth into its optimized inference mode (~2x faster generation).
FastLanguageModel.for_inference(model)
+
33
+ # -----------------------------
34
+ # VERIFICATION PROMPT
35
+ # -----------------------------
36
def inference_prompt(text, subclaim):
    """Build the strict closed-world grounding prompt: the model must label
    ``subclaim`` against ``text`` only, answering with a single word.
    """
    # The exact wording is the model's contract; check_support() parses the
    # reply by substring matching on 'supported' / 'not_supported' / 'refuted'.
    return f"""You are a clinical evidence auditor. Your evaluation must be based STRICTLY and ONLY on the provided medical text.

### MANDATORY GROUNDING RULES:
1. NO OUTSIDE KNOWLEDGE: Do not use your internal medical knowledge. Even if a subclaim is "common sense" in medicine, if it is not explicitly in the TEXT, it is 'not_supported'.
2. NO LOGICAL LEAPS: Do not bridge gaps in logic. (e.g., If the text mentions "high blood sugar" but not the word "diabetes", you cannot support a claim of "diabetes").
3. EXACT NUMERICAL MATCHING: Any doses (e.g., 500mg), frequencies (e.g., twice daily), or durations (e.g., 10 days) mentioned in the subclaim must match the text perfectly. If they are missing or different in the text, label as 'not_supported'.
4. DEFAULT TO NOT SUPPORTED: If the text is vague, ambiguous, or only suggests a possibility, you MUST choose 'not_supported'.
5. CLOSED-WORLD REALITY: Treat the TEXT as the only information that exists in the world.

### Medical Text:
{text}

### Subclaim:
{subclaim}

Output exactly one word ('supported' or 'not_supported') based on the strict rules above:"""
54
+
55
+ # -----------------------------
56
+ # VERIFICATION LOGIC (UNSLOTH VERSION)
57
+ # -----------------------------
58
def check_support(text: str, subclaim: str, error_log=None) -> str:
    """Classify whether ``subclaim`` is grounded in ``text`` using the
    module-level model.

    Parameters
    ----------
    text : str
        The source medical text (closed world for the auditor).
    subclaim : str
        The claim to verify against ``text``.
    error_log : list | None
        Optional sink; inference failures are appended as dicts instead of
        raising, keeping long batch runs alive.

    Returns
    -------
    str
        One of ``"supported"``, ``"not_supported"``, ``"refuted"``. Falls
        back to ``"not_supported"`` on empty input, unparseable output, or
        inference errors.
    """
    if not text or not subclaim:
        return "not_supported"

    prompt_content = inference_prompt(text, subclaim)

    # Wrap in the model's chat template so generation starts at the
    # assistant turn.
    messages = [{"role": "user", "content": prompt_content}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    try:
        outputs = model.generate(
            input_ids = inputs,
            max_new_tokens = 512,
            # BUGFIX: the old call passed temperature=0.1 without
            # do_sample=True, so the temperature was silently ignored (with a
            # transformers warning). Decoding here is explicitly greedy.
            do_sample = False,
            use_cache = True,
        )

        # Decode only the newly generated tokens (slice off the prompt).
        res = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
        res = res.strip().lower()

        # Strip chain-of-thought; BUGFIX: take the text after the LAST close
        # tag ([-1]) so multiple <think> blocks do not break parsing.
        if "</think>" in res:
            res = res.split("</think>")[-1].strip()

        # Order matters: the plain substring "supported" also occurs inside
        # "not_supported", so the negative label must be checked first.
        if "not_supported" in res:
            return "not_supported"
        elif "supported" in res:
            return "supported"
        elif "refuted" in res:
            return "refuted"
        else:
            return "not_supported"

    except Exception as e:
        if error_log is not None:
            error_details = {"subclaim": subclaim, "error_msg": str(e), "type": "LOCAL_INFERENCE_ERROR"}
            error_log.append(error_details)
        return "not_supported"
103
+
104
+ # -----------------------------
105
+ # MAIN (Processing logic remains largely identical)
106
+ # -----------------------------
107
if __name__ == "__main__":
    # CLI: input dataset, output folder, and an optional [start, end) slice
    # for sharding the workload across processes.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json")
    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/concise_complete_attr_testing")
    parser.add_argument("--start_index", type=int, default=0)
    parser.add_argument("--end_index", type=int, default=-1)

    args = parser.parse_args()

    INPUT_FILE = args.input_file
    SAVE_FOLDER = args.save_folder
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    with open(INPUT_FILE, "r") as f:
        all_data = json.load(f)

    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len
    data_slice = all_data[start:min(end, total_len)]

    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}_nemotran-30B.json")

    # Resume mode: reload prior results; a corrupt or partial file restarts
    # this shard cleanly.
    processed_results = []
    if os.path.exists(OUTPUT_FILE):
        try:
            with open(OUTPUT_FILE, "r") as f:
                processed_results = json.load(f)
        except (json.JSONDecodeError, OSError):
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Only I/O and parse failures
            # should trigger a clean restart.
            processed_results = []

    # NOTE(review): resume is keyed on the full medical text, so an item
    # whose subclaims were only partially evaluated before a crash is skipped
    # entirely on resume — confirm this is acceptable.
    processed_ids = {item['medical_text'] for item in processed_results}
    global_error_log = []

    pbar = tqdm.tqdm(data_slice)

    for item in pbar:
        text = item.get('full_text', '')
        if text in processed_ids: continue  # Simple skip logic for resume

        subclaims = item.get('dat', {}).get('dat', [])

        for subclaim_obj in subclaims:
            subclaim_text = subclaim_obj.get('subclaim', '')
            label_gt = subclaim_obj.get('status', 'not_supported').strip().lower()

            label_gen = check_support(text, subclaim_text, error_log=global_error_log)

            correctness = (label_gen == label_gt)

            result_entry = {
                "medical_text": text,
                "subclaim": subclaim_text,
                "label_gt": label_gt,
                "label_gen": label_gen,
                "correctness": correctness
            }
            processed_results.append(result_entry)

            # Intermediate save after every subclaim: a crash loses at most
            # one verdict (intentionally aggressive I/O).
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=2, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=2, ensure_ascii=False)
code/finetune-inference/old/prompt_generate.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ALL_PROMPTS = {
2
+ "en": {
3
+ "B1": """You are a summarization assistant. Your single most important goal is to rewrite medical text for a first-grade reading level (ages 5-7, FKGL 1.0-4.0). Simplicity is more important than detail.
4
+
5
+ Core Mandate:
6
+ - TARGET AUDIENCE: A 6-year-old child.
7
+ - PRIMARY GOAL: Extreme simplicity. If you must choose between accuracy of detail and simplicity, ALWAYS choose simplicity.
8
+
9
+ Strict Rules You Must Follow:
10
+ - SENTENCE LENGTH: Keep almost all sentences under 10 words. Use very short, simple sentences.
11
+ - VOCABULARY: Use only very common, everyday words that a first-grader would know. Avoid any medical or scientific terms. Instead of 'femur', say 'thigh bone'. Instead of 'benign', say 'not harmful'.
12
+ - TONE: Be very gentle, calm, and reassuring. Like a kind doctor explaining something to a small child.
13
+ - STRUCTURE: Use short paragraphs, often just one or two sentences long.
14
+ - FOCUS: Only mention the most important one or two points from the original text. Omit all other details.
15
+
16
+ - Never use emojis.
17
+ - Do not explain pronunciation.
18
+ - DO NOT use any medical jargon.
19
+ """,
20
+ "B2": """You are a summarization assistant trained to rewrite medical summaries for a middle school reading level (ages 11–14, FKGL 6.0–9.0). Your goal is clarity for a teenager with a basic understanding of biology.
21
+
22
+ Core Mandate:
23
+ - TARGET AUDIENCE: A 14-year-old in a 9th-grade biology class.
24
+ - PRIMARY GOAL: Clarity and straightforward explanation.
25
+
26
+ Strict Rules You Must Follow:
27
+ - SENTENCE LENGTH: Vary sentence length, but aim for an average of 12-18 words. Avoid long, complex sentences.
28
+ - VOCABULARY: You can use basic medical terms (e.g., 'biopsy', 'cells', 'tumor'), but you MUST explain them in simple terms immediately. For example: "A biopsy, which is when a small piece of tissue is taken for testing...".
29
+ - TONE: Be empathetic but direct. Use an educational and informative tone, like a science teacher.
30
+ - STRUCTURE: Organize the summary into logical paragraphs. You can use simple headings if it helps clarity (e.g., "What They Found," "What It Means").
31
+ - FOCUS: Summarize the main findings and their implications. Omit minor or highly technical details.
32
+
33
+ - Never use emojis.
34
+ - Do not explain pronunciation.
35
+ """,
36
+ "B3": """You are a summarization assistant trained to rewrite medical summaries for an educated, non-medical adult (ages 17+, FKGL 12.0+). Your goal is to be precise, comprehensive, and clear for a college-level reader.
37
+
38
+ Core Mandate:
39
+ - TARGET AUDIENCE: A curious college student or adult with no medical training.
40
+ - PRIMARY GOAL: Precision and structured clarity.
41
+
42
+ Strict Rules You Must Follow:
43
+ - SENTENCE LENGTH: Use clear, well-constructed sentences. Complex sentences are acceptable if they enhance clarity and precision.
44
+ - VOCABULARY: Use correct medical terminology. You can assume the reader can understand terms from context or look them up, but for very specialized terms, provide a brief parenthetical explanation. For example: "...showed evidence of hyperplasia (an increase in the number of cells)."
45
+ - TONE: Maintain a professional, empathetic, and respectful tone. Be authoritative but not clinical or cold.
46
+ - STRUCTURE: Provide a detailed and structured summary. Use headings to organize information, such as "Background," "Key Findings," "Clinical Interpretation," and "Next Steps."
47
+ - FOCUS: Be comprehensive and faithful to the source summary. Include important details, test results, and differential diagnoses mentioned in the source.
48
+
49
+ - Never use emojis.
50
+ - Do not explain pronunciation.
51
+ """
52
+ },
53
+ "es": {
54
+ "B1": """Eres un asistente de resumen. Tu único y más importante objetivo es reescribir texto médico para un nivel de lectura de primer grado (edades 5-7). La simplicidad es más importante que el detalle.
55
+
56
+ Mandato Principal:
57
+ - PÚBLICO OBJETIVO: Un niño de 6 años.
58
+ - OBJETIVO PRIMARIO: Simplicidad extrema. Si debes elegir entre la precisión del detalle y la simplicidad, SIEMPRE elige la simplicidad.
59
+
60
+ Reglas Estrictas que Debes Seguir:
61
+ - IDIOMA: El resumen DEBE estar escrito en español.
62
+ - LONGITUD DE LA ORACIÓN: Casi todas las oraciones deben tener menos de 10 palabras. Usa frases muy cortas y simples.
63
+ - VOCABULARIO: Usa solo palabras cotidianas y muy comunes que un niño de primer grado conocería. Evita cualquier término médico o científico. En lugar de 'fémur', di 'hueso del muslo'. En lugar de 'benigno', di 'que no es dañino'.
64
+ - TONO: Sé muy gentil, calmado y tranquilizador. Como un doctor amable explicándole algo a un niño pequeño.
65
+ - ESTRUCTURA: Usa párrafos cortos, a menudo de solo una o dos oraciones.
66
+ - ENFOQUE: Menciona solo el punto más importante o los dos puntos más importantes del texto original. Omite todos los demás detalles.
67
+
68
+ - Nunca uses emojis.
69
+ - No expliques la pronunciación.
70
+ - NO uses jerga médica.
71
+ """,
72
+ "B2": """Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un nivel de lectura de secundaria (edades 11–14). Tu objetivo es la claridad para un adolescente con conocimientos básicos de biología.
73
+
74
+ Mandato Principal:
75
+ - PÚBLICO OBJETIVO: Un estudiante de 14 años en una clase de biología de secundaria.
76
+ - OBJETIVO PRIMARIO: Claridad y explicación directa.
77
+
78
+ Reglas Estrictas que Debes Seguir:
79
+ - IDIOMA: El resumen DEBE estar escrito en español.
80
+ - LONGITUD DE LA ORACIÓN: Varía la longitud de las oraciones, pero busca un promedio de 12-18 palabras. Evita las oraciones largas y complejas.
81
+ - VOCABULARIO: Puedes usar términos médicos básicos (ej., 'biopsia', 'células', 'tumor'), pero DEBES explicarlos en términos sencillos inmediatamente. Por ejemplo: "Una biopsia, que es cuando se toma un pequeño trozo de tejido para analizarlo...".
82
+ - TONO: Sé empático pero directo. Usa un tono educativo e informativo, como un profesor de ciencias.
83
+ - ESTRUCTURA: Organiza el resumen en párrafos lógicos. Puedes usar encabezados simples si ayuda a la claridad (ej., "Lo que Encontraron," "Qué Significa").
84
+ - ENFOQUE: Resume los hallazgos principales y sus implicaciones. Omite detalles menores o muy técnicos.
85
+
86
+ - Nunca uses emojis.
87
+ - No expliques la pronunciación.
88
+ """,
89
+ "B3": """Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un adulto educado no médico (edades 17+). Tu objetivo es ser preciso, completo y claro para un lector de nivel universitario.
90
+
91
+ Mandato Principal:
92
+ - PÚBLICO OBJETIVO: Un estudiante universitario o un adulto curioso sin formación médica.
93
+ - OBJETIVO PRIMARIO: Precisión y claridad estructurada.
94
+
95
+ Reglas Estrictas que Debes Seguir:
96
+ - IDIOMA: El resumen DEBE estar escrito en español.
97
+ - LONGITUD DE LA ORACIÓN: Usa oraciones claras y bien construidas. Las oraciones complejas son aceptables si mejoran la claridad y la precisión.
98
+ - VOCABULARIO: Usa la terminología médica correcta. Puedes asumir que el lector puede entender los términos por el contexto o buscarlos, pero para términos muy especializados, proporciona una breve explicación entre paréntesis. Por ejemplo: "...mostró evidencia de hiperplasia (un aumento en el número de células)."
99
+ - TONO: Mantén un tono profesional, empático y respetuoso. Sé autoritario pero no clínico o frío.
100
+ - ESTRUCTURA: Proporciona un resumen detallado y estructurado. Usa encabezados para organizar la información, como "Contexto," "Hallazgos Clave," "Interpretación Clínica," y "Próximos Pasos."
101
+ - ENFOQUE: Sé completo y fiel al resumen original. Incluye detalles importantes, resultados de pruebas y diagnósticos diferenciales mencionados en la fuente.
102
+
103
+ - Nunca uses emojis.
104
+ - No expliques la pronunciación.
105
+ """
106
+ },
107
+ "fr": {
108
+ "B1": """Vous êtes un assistant de résumé. Votre unique et plus important objectif est de réécrire un texte médical pour un niveau de lecture de cours préparatoire (âges 5-7). La simplicité est plus importante que le détail.
109
+
110
+ Mandat Principal :
111
+ - PUBLIC CIBLE : Un enfant de 6 ans.
112
+ - OBJECTIF PRINCIPAL : Simplicité extrême. Si vous devez choisir entre la précision des détails et la simplicité, choisissez TOUJOURS la simplicité.
113
+
114
+ Règles Strictes à Suivre Impérativement :
115
+ - LANGUE : Le résumé DOIT être rédigé en français.
116
+ - LONGUEUR DES PHRASES : Presque toutes les phrases doivent faire moins de 10 mots. Utilisez des phrases très courtes et simples.
117
+ - VOCABULAIRE : Utilisez uniquement des mots très courants et quotidiens qu'un enfant de cet âge connaîtrait. Évitez tout terme médical ou scientifique. Au lieu de 'fémur', dites 'l'os de la cuisse'. Au lieu de 'bénin', dites 'pas dangereux'.
118
+ - TON : Soyez très doux, calme et rassurant. Comme un médecin bienveillant qui explique quelque chose à un jeune enfant.
119
+ - STRUCTURE : Utilisez des paragraphes courts, souvent composés d'une ou deux phrases seulement.
120
+ - ENFOQUE : Mentionnez uniquement le ou les deux points les plus importants du texte original. Omettez tous les autres détails.
121
+
122
+ - N'utilisez jamais d'emojis.
123
+ - N'expliquez pas la prononciation.
124
+ - N'utilisez AUCUN jargon médical.
125
+ """,
126
+ "B2": """Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un niveau de lecture de collège (âges 11–14). Votre objectif est la clarté pour un adolescent ayant une compréhension de base de la biologie.
127
+
128
+ Mandat Principal :
129
+ - PUBLIC CIBLE : Un adolescent de 14 ans en classe de biologie au collège.
130
+ - OBJECTIF PRINCIPAL : Clarté et explication directe.
131
+
132
+ Règles Strictes à Suivre Impérativement :
133
+ - LANGUE : Le résumé DOIT être rédigé en français.
134
+ - LONGUEUR DES PHRASES : Variez la longueur des phrases, mais visez une moyenne de 12-18 mots. Évitez les phrases longues et complexes.
135
+ - VOCABULAIRE : Vous pouvez utiliser des termes médicaux de base (ex: 'biopsie', 'cellules', 'tumeur'), mais vous DEVEZ les expliquer en termes simples immédiatement. Par exemple : "Une biopsie, c'est-à-dire quand on prélève un petit morceau de tissu pour l'analyser...".
136
+ - TON : Soyez empathique mais direct. Adoptez un ton pédagogique et informatif, comme un professeur de sciences.
137
+ - STRUCTURE : Organisez le résumé en paragraphes logiques. Vous pouvez utiliser des titres simples si cela améliore la clarté (ex: "Ce qu'ils ont trouvé", "Ce que cela signifie").
138
+ - ENFOQUE : Résumez les principales observations et leurs implications. Omettez les détails mineurs ou très techniques.
139
+
140
+ - N'utilisez jamais d'emojis.
141
+ - N'expliquez pas la prononciation.
142
+ """,
143
+ "B3": """Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un adulte éduqué non-médecin (âges 17+). Votre objectif est d'être précis, complet et clair pour un lecteur de niveau universitaire.
144
+
145
+ Mandat Principal :
146
+ - PUBLIC CIBLE : Un étudiant ou un adulte curieux sans formation médicale.
147
+ - OBJECTIF PRINCIPAL : Précision et clarté structurée.
148
+
149
+ Règles Strictes à Suivre Impérativement :
150
+ - LANGUE : Le résumé DOIT être rédigé en français.
151
+ - LONGUEUR DES PHRASES : Utilisez des phrases claires et bien construites. Les phrases complexes sont acceptables si elles améliorent la clarté et la précision.
152
+ - VOCABULAIRE : Utilisez la terminologie médicale correcte. Vous pouvez supposer que le lecteur peut comprendre les termes par le contexte ou les rechercher, mais pour les termes très spécialisés, fournissez une brève explication entre parenthèses. Par exemple : "...montrait des signes d'hyperplasie (une augmentation du nombre de cellules)."
153
+ - TON : Maintenez un ton professionnel, empathique et respectueux. Soyez directif mais ni clinique ni froid.
154
+ - STRUCTURE : Fournissez un résumé détaillé et structuré. Utilisez des titres pour organiser l'information, tels que "Contexte", "Principales Observations", "Interprétation Clinique" et "Prochaines Étapes".
155
+ - ENFOQUE : Soyez complet et fidèle au résumé source. Incluez les détails importants, les résultats des tests et les diagnostics différentiels mentionnés dans la source.
156
+
157
+ - N'utilisez jamais d'emojis.
158
+ - N'expliquez pas la prononciation.
159
+ """
160
+ },
161
+
162
+ "pt": {
163
+ "B1": """Você é um assistente de resumo. O seu único e mais importante objetivo é reescrever textos médicos para um nível de leitura da primeira série (idades 5-7). A simplicidade é mais importante que os detalhes.
164
+
165
+ Mandato Principal:
166
+ - PÚBLICO-ALVO: Uma criança de 6 anos.
167
+ - OBJETIVO PRINCIPAL: Simplicidade extrema. Se tiver que escolher entre a precisão dos detalhes e a simplicidade, ESCOLHA SEMPRE a simplicidade.
168
+
169
+ Regras Rígidas que Você Deve Seguir:
170
+ - IDIOMA: O resumo DEVE ser escrito em português.
171
+ - COMPRIMENTO DAS FRASES: Quase todas as frases devem ter menos de 10 palavras. Use frases muito curtas e simples.
172
+ - VOCABULÁRIO: Use apenas palavras quotidianas e muito comuns que uma criança da primeira série conheceria. Evite qualquer termo médico ou científico. Em vez de 'fêmur', diga 'o osso da coxa'. Em vez de 'benigno', diga 'que não faz mal'.
173
+ - TOM: Seja muito gentil, calmo e tranquilizador. Como um médico amável a explicar algo a uma criança pequena.
174
+ - ESTRUTURA: Use parágrafos curtos, muitas vezes com apenas uma ou duas frases.
175
+ - FOCO: Mencione apenas um ou dois dos pontos mais importantes do texto original. Omita todos os outros detalhes.
176
+
177
+ - Nunca use emojis.
178
+ - Não explique a pronúncia.
179
+ - NÃO use NENHUM jargão médico.
180
+ """,
181
+ "B2": """Você é um assistente de resumo treinado para reescrever resumos médicos para um nível de leitura do ensino fundamental II (idades 11–14). O seu objetivo é a clareza para um adolescente com conhecimentos básicos de biologia.
182
+
183
+ Mandato Principal:
184
+ - PÚBLICO-ALVO: Um adolescente de 14 anos numa aula de biologia.
185
+ - OBJETIVO PRINCIPAL: Clareza e explicação direta.
186
+
187
+ Regras Rígidas que Você Deve Seguir:
188
+ - IDIOMA: O resumo DEVE ser escrito em português.
189
+ - COMPRIMENTO DAS FRASES: Varie o comprimento das frases, mas procure uma média de 12 a 18 palavras. Evite frases longas e complexas.
190
+ - VOCABULÁRIO: Pode usar termos médicos básicos (ex: 'biópsia', 'células', 'tumor'), mas você DEVE explicá-los em termos simples imediatamente. Por exemplo: "Uma biópsia, que é quando um pequeno pedaço de tecido é retirado para ser analisado...".
191
+ - TOM: Seja empático, mas direto. Use um tom educativo e informativo, como um professor de ciências.
192
+ - ESTRUTURA: Organize o resumo em parágrafos lógicos. Pode usar títulos simples se isso ajudar na clareza (ex: "O que eles encontraram", "O que isso significa").
193
+ - FOCO: Resuma os principais achados e as suas implicações. Omita detalhes menores ou muito técnicos.
194
+
195
+ - Nunca use emojis.
196
+ - Não explique a pronúncia.
197
+ """,
198
+ "B3": """Você é um assistente de resumo treinado para reescrever resumos médicos para um adulto instruído, mas sem formação médica (idades 17+). O seu objetivo é ser preciso, abrangente e claro para um leitor de nível universitário.
199
+
200
+ Mandato Principal:
201
+ - PÚBLICO-ALVO: Um estudante universitário ou adulto curioso sem formação médica.
202
+ - OBJETIVO PRINCIPAL: Precisão e clareza estruturada.
203
+
204
+ Regras Rígidas que Você Deve Seguir:
205
+ - IDIOMA: O resumo DEVE ser escrito em português.
206
+ - COMPRIMENTO DAS FRASES: Use frases claras e bem construídas. Frases complexas são aceitáveis se melhorarem a clareza e a precisão.
207
+ - VOCABULÁRIO: Use a terminologia médica correta. Pode assumir que o leitor consegue entender os termos pelo contexto ou pesquisá-los, mas para termos muito especializados, forneça uma breve explicação entre parênteses. Por exemplo: "...mostrou evidência de hiperplasia (um aumento no número de células)."
208
+ - TOM: Mantenha um tom profissional, empático e respeitoso. Seja confiante, mas não clínico ou frio.
209
+ - ESTRUTURA: Forneça um resumo detalhado e estruturado. Use títulos para organizar a informação, como "Contexto", "Principais Achados", "Interpretação Clínica" e "Próximos Passos".
210
+ - FOCO: Seja abrangente e fiel ao resumo original. Inclua detalhes importantes, resultados de testes e diagnósticos diferenciais mencionados na fonte.
211
+
212
+ - Nunca use emojis.
213
+ - Não explique a pronúncia.
214
+ """
215
+ }
216
+
217
+ }
218
+ USER_PROMPT_TEMPLATES = {
219
+ "en": """Please rewrite the following expert summary for the specified target audience. Use the full article for context if needed.
220
+ **Full Article Context:**
221
+ {article}
222
+ **Expert Summary to Rewrite:**
223
+ {gold_summary}
224
+ """,
225
+ "es": """Por favor, reescribe el siguiente resumen de experto para el público objetivo especificado. Usa el artículo completo como contexto si es necesario.
226
+ **Contexto del Artículo Completo:**
227
+ {article}
228
+ **Resumen de Experto a Reescribir:**
229
+ {gold_summary}
230
+ """,
231
+ "fr": """Veuillez réécrire le résumé d'expert suivant pour le public cible spécifié. Utilisez l'article complet comme contexte si nécessaire.
232
+ **Contexte de l'Article Complet :**
233
+ {article}
234
+ **Résumé d'Expert à Réécrire :**
235
+ {gold_summary}
236
+ """,
237
+ "pt": """Por favor, reescreva o seguinte resumo de especialista para o público-alvo especificado. Use o artigo completo como contexto, se necessário.
238
+ **Contexto do Artigo Completo:**
239
+ {article}
240
+ **Resumo do Especialista a Ser Reescrito:**
241
+ {gold_summary}
242
+ """
243
+ }
244
+
245
def generate_prompt(article, gold_summary, band, lang):
    """Assemble the full prompt (system + user) for a readability band and language.

    Parameters
    ----------
    article : str
        Full source article, interpolated into the user prompt as context.
    gold_summary : str
        Expert summary to be rewritten.
    band : str
        Readability band key ("B1", "B2", or "B3").
    lang : str
        Language code ("en", "es", "fr", "pt").

    Returns
    -------
    str
        System prompt and filled-in user prompt joined by a newline.

    Raises
    ------
    ValueError
        When ``lang`` or ``band`` has no configured prompt.
    """
    prompts_for_lang = ALL_PROMPTS.get(lang)
    user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)
    if not prompts_for_lang or not user_prompt_template:
        raise ValueError(f"No prompts available for language: {lang}")

    system_prompt = prompts_for_lang.get(band)
    if system_prompt is None:
        # BUGFIX: an unknown band previously raised a bare KeyError; fail with
        # the same explicit ValueError style used for an unknown language.
        raise ValueError(f"No prompt available for band {band!r} in language {lang!r}")

    user_prompt = user_prompt_template.format(article=article, gold_summary=gold_summary)
    return system_prompt + "\n" + user_prompt
code/finetune-inference/old/statistics.ipynb ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "1408eea5",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "\n",
12
+ "with open('/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json', 'r') as f:\n",
13
+ " data_item = json.load(f)\n",
14
+ "data = []\n",
15
+ "for item in data_item:\n",
16
+ " attribution=item['attribution']['accuracy']\n",
17
+ " data.append(attribution)"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "c706e713",
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "import numpy as np\n",
28
+ "import pandas as pd\n",
29
+ "import seaborn as sns\n",
30
+ "import matplotlib.pyplot as plt\n",
31
+ "from scipy import stats\n",
32
+ "\n",
33
+ "# Example data list\n",
34
+ "# data = [12, 15, 14, 18, 19, 17, 21]\n",
35
+ "\n",
36
+ "# Convert to a pandas Series for convenience\n",
37
+ "s = pd.Series(data)\n",
38
+ "\n",
39
+ "# --- 1. Basic statistics ---\n",
40
+ "summary = s.describe()\n",
41
+ "print(\"Basic statistics:\")\n",
42
+ "print(summary)\n",
43
+ "\n",
44
+ "# Extra metrics\n",
45
+ "print(\"\\nAdditional info:\")\n",
46
+ "print(f\"Variance: {s.var():.2f}\")\n",
47
+ "print(f\"Skewness: {s.skew():.2f}\")\n",
48
+ "print(f\"Kurtosis: {s.kurt():.2f}\")\n",
49
+ "print(f\"Mode: {s.mode().tolist()}\")\n",
50
+ "\n",
51
+ "# --- 2. Visualization ---\n",
52
+ "plt.figure(figsize=(8, 5))\n",
53
+ "sns.histplot(s, bins=10, kde=True, color='skyblue', edgecolor='black')\n",
54
+ "plt.title(\"Distribution curve of data\")\n",
55
+ "plt.xlabel(\"Value\")\n",
56
+ "plt.ylabel(\"Frequency\")\n",
57
+ "plt.show()"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "860aff4b",
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "import pandas as pd\n",
68
+ "\n",
69
+ "s = pd.Series(data) # sample data with an outlier\n",
70
+ "\n",
71
+ "# Compute IQR boundaries\n",
72
+ "Q1 = s.quantile(0.25)\n",
73
+ "Q3 = s.quantile(0.75)\n",
74
+ "IQR = Q3 - Q1\n",
75
+ "\n",
76
+ "lower_lim = Q1 - 1.5 * IQR\n",
77
+ "upper_lim = Q3 + 1.5 * IQR\n",
78
+ "\n",
79
+ "cleaned = s[(s >= lower_lim) & (s <= upper_lim)]\n",
80
+ "\n",
81
+ "print(\"Cleaned data:\")\n",
82
+ "print(len(cleaned.tolist()))\n",
83
+ "import seaborn as sns\n",
84
+ "import matplotlib.pyplot as plt\n",
85
+ "\n",
86
+ "sns.boxplot(x=s, color=\"lightblue\")\n",
87
+ "plt.title(\"Before cleaning\")\n",
88
+ "plt.show()\n",
89
+ "\n",
90
+ "sns.boxplot(x=cleaned, color=\"lightgreen\")\n",
91
+ "plt.title(\"After IQR cleaning\")\n",
92
+ "plt.show()"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": null,
98
+ "id": "4b1f16b3",
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "import numpy as np\n",
103
+ "from scipy import stats\n",
104
+ "\n",
105
+ "z_scores = np.abs(stats.zscore(s))\n",
106
+ "threshold = 3 # commonly used threshold\n",
107
+ "cleaned_z = s[z_scores < threshold]\n",
108
+ "print(len(cleaned_z.tolist()))\n",
109
+ "import seaborn as sns\n",
110
+ "import matplotlib.pyplot as plt\n",
111
+ "print(\"Cleaned data (Z-score method):\")\n",
112
+ "sns.boxplot(x=s, color=\"lightblue\")\n",
113
+ "plt.title(\"Before cleaning\")\n",
114
+ "plt.show()\n",
115
+ "\n",
116
+ "sns.boxplot(x=cleaned_z, color=\"lightgreen\")\n",
117
+ "plt.title(\"After Z-score cleaning\")\n",
118
+ "plt.show()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "4394d44c",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "8e24c8c2",
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": []
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "f97f821e",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "import json\n",
145
+ "import pandas as pd\n",
146
+ "import matplotlib.pyplot as plt\n",
147
+ "import seaborn as sns\n",
148
+ "\n",
149
+ "def analyze_doclens_results(file_path):\n",
150
+ " \"\"\"\n",
151
+ " Loads, parses, and analyzes the DOCLENS evaluation results from a JSON file.\n",
152
+ "\n",
153
+ " Args:\n",
154
+ " file_path (str): The path to the JSON results file.\n",
155
+ "\n",
156
+ " Returns:\n",
157
+ " pandas.DataFrame: A DataFrame with the aggregated mean scores.\n",
158
+ " \"\"\"\n",
159
+ " # Load the entire JSON file\n",
160
+ " try:\n",
161
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
162
+ " data = json.load(f)\n",
163
+ " except FileNotFoundError:\n",
164
+ " print(f\"Error: The file '{file_path}' was not found.\")\n",
165
+ " return None\n",
166
+ " except json.JSONDecodeError:\n",
167
+ " print(f\"Error: The file '{file_path}' is not a valid JSON file.\")\n",
168
+ " return None\n",
169
+ "\n",
170
+ " # Parse the nested data into a flat list of dictionaries\n",
171
+ " parsed_data = []\n",
172
+ " for record in data:\n",
173
+ " record_id = record.get(\"id\")\n",
174
+ " version = record.get(\"version\")\n",
175
+ " \n",
176
+ " # Extract accuracy scores safely\n",
177
+ " completeness_acc = record.get(\"completeness\", {}).get(\"accuracy\", 0)\n",
178
+ " conciseness_acc = record.get(\"conciseness\", {}).get(\"accuracy\", 0)\n",
179
+ " attribution_acc = record.get(\"attribution\", {}).get(\"accuracy\", 0)\n",
180
+ "\n",
181
+ " parsed_data.append({\n",
182
+ " \"id\": record_id,\n",
183
+ " \"version\": version,\n",
184
+ " \"completeness\": completeness_acc,\n",
185
+ " \"conciseness\": conciseness_acc,\n",
186
+ " \"attribution\": attribution_acc\n",
187
+ " })\n",
188
+ "\n",
189
+ " # Create a pandas DataFrame\n",
190
+ " df = pd.DataFrame(parsed_data)\n",
191
+ "\n",
192
+ " # Calculate the mean scores for each version\n",
193
+ " # The order is specified to ensure 'easy', 'intermediate', 'hard' are plotted correctly\n",
194
+ " version_order = ['easy', 'intermediate', 'hard']\n",
195
+ " df['version'] = pd.Categorical(df['version'], categories=version_order, ordered=True)\n",
196
+ " \n",
197
+ " agg_results = df.groupby('version')[['completeness', 'conciseness', 'attribution']].mean().reset_index()\n",
198
+ "\n",
199
+ " print(\"--- Aggregated Mean Scores ---\")\n",
200
+ " print(agg_results.to_string(index=False))\n",
201
+ " \n",
202
+ " return agg_results\n",
203
+ "\n",
204
+ "def visualize_results(df):\n",
205
+ " \"\"\"\n",
206
+ " Generates and saves bar charts to visualize the aggregated results.\n",
207
+ " \"\"\"\n",
208
+ " if df is None or df.empty:\n",
209
+ " print(\"Cannot visualize results. DataFrame is empty.\")\n",
210
+ " return\n",
211
+ "\n",
212
+ " sns.set_style(\"whitegrid\")\n",
213
+ " fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)\n",
214
+ " fig.suptitle('Average Evaluation Metrics Across Summary Versions', fontsize=16)\n",
215
+ "\n",
216
+ " # Plot Completeness\n",
217
+ " sns.barplot(ax=axes[0], x='version', y='completeness', data=df, palette='Blues_d')\n",
218
+ " axes[0].set_title('Completeness (Claim Recall)')\n",
219
+ " axes[0].set_xlabel('Summary Version')\n",
220
+ " axes[0].set_ylabel('Average Accuracy (%)')\n",
221
+ "\n",
222
+ " # Plot Conciseness\n",
223
+ " sns.barplot(ax=axes[1], x='version', y='conciseness', data=df, palette='Greens_d')\n",
224
+ " axes[1].set_title('Conciseness (Claim Precision)')\n",
225
+ " axes[1].set_xlabel('Summary Version')\n",
226
+ " axes[1].set_ylabel('')\n",
227
+ "\n",
228
+ " # Plot Attribution\n",
229
+ " sns.barplot(ax=axes[2], x='version', y='attribution', data=df, palette='Oranges_d')\n",
230
+ " axes[2].set_title('Attribution')\n",
231
+ " axes[2].set_xlabel('Summary Version')\n",
232
+ " axes[2].set_ylabel('')\n",
233
+ " \n",
234
+ " # Improve layout and save the figure\n",
235
+ " plt.tight_layout(rect=[0, 0, 1, 0.96])\n",
236
+ " plt.savefig(\"doclens_evaluation_summary.png\", dpi=300)\n",
237
+ " print(\"\\nChart saved as 'doclens_evaluation_summary.png'\")\n",
238
+ " plt.show()\n",
239
+ "\n",
240
+ "\n",
241
+ "# --- Main Execution ---\n",
242
+ "# Replace 'your_results_file.json' with the actual path to your file\n",
243
+ "results_file = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json' \n",
244
+ "aggregated_data = analyze_doclens_results(results_file)\n",
245
+ "\n",
246
+ "if aggregated_data is not None:\n",
247
+ " visualize_results(aggregated_data)"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "id": "b5afb981",
253
+ "metadata": {},
254
+ "source": [
255
+ "## Eliminate dataset"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 18,
261
+ "id": "b29bcf30",
262
+ "metadata": {},
263
+ "outputs": [
264
+ {
265
+ "name": "stdout",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "Rejected 15 items due to low attribution.\n",
269
+ "Rejected 9 additional items due to incorrect completeness trend.\n",
270
+ "\n",
271
+ "--- Filtering Summary ---\n",
272
+ "Total unique items analyzed: 100\n",
273
+ "Items kept (High Quality): 76\n",
274
+ "Items rejected (Low Quality): 24\n",
275
+ "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json'\n",
276
+ "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json'\n"
277
+ ]
278
+ }
279
+ ],
280
+ "source": [
281
+ "import json\n",
282
+ "import pandas as pd\n",
283
+ "\n",
284
+ "def filter_low_quality_data(file_path, attribution_threshold=80.0, completeness_trend_check=True):\n",
285
+ " \"\"\"\n",
286
+ " Loads DOCLENS results, filters out low-quality data, and returns clean/rejected data.\n",
287
+ " \"\"\"\n",
288
+ " try:\n",
289
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
290
+ " data = json.load(f)\n",
291
+ " except (FileNotFoundError, json.JSONDecodeError) as e:\n",
292
+ " print(f\"Error loading file: {e}\")\n",
293
+ " return None, None\n",
294
+ "\n",
295
+ " # --- FIX: Parse the nested JSON to extract numeric accuracy scores ---\n",
296
+ " # Create a flat list of dictionaries instead of a list of nested objects\n",
297
+ " parsed_data = []\n",
298
+ " for record in data:\n",
299
+ " parsed_data.append({\n",
300
+ " \"id\": record.get(\"id\"),\n",
301
+ " \"version\": record.get(\"version\"),\n",
302
+ " \"completeness\": record.get(\"completeness\", {}).get(\"accuracy\", 0),\n",
303
+ " \"conciseness\": record.get(\"conciseness\", {}).get(\"accuracy\", 0),\n",
304
+ " \"attribution\": record.get(\"attribution\", {}).get(\"accuracy\", 0)\n",
305
+ " })\n",
306
+ "\n",
307
+ " # Create DataFrame from the *parsed* data\n",
308
+ " df = pd.DataFrame(parsed_data)\n",
309
+ " # --------------------------------------------------------------------\n",
310
+ " \n",
311
+ " all_ids = set(df['id'].unique())\n",
312
+ " rejected_ids = set()\n",
313
+ "\n",
314
+ " # --- Pivot data for easier comparison across versions ---\n",
315
+ " # This part now works correctly because the columns are numeric\n",
316
+ " pivot_df = df.pivot_table(\n",
317
+ " index='id',\n",
318
+ " columns='version',\n",
319
+ " values=['completeness', 'conciseness', 'attribution']\n",
320
+ " )\n",
321
+ " pivot_df.columns = ['_'.join(map(str, col)).strip() for col in pivot_df.columns.values]\n",
322
+ " \n",
323
+ " # --- Filter 1: Low Attribution ---\n",
324
+ " low_attribution_mask = (pivot_df['attribution_easy'] < attribution_threshold) | \\\n",
325
+ " (pivot_df['attribution_intermediate'] < attribution_threshold) | \\\n",
326
+ " (pivot_df['attribution_hard'] < attribution_threshold)\n",
327
+ " rejected_attribution_ids = pivot_df[low_attribution_mask].index\n",
328
+ " rejected_ids.update(rejected_attribution_ids)\n",
329
+ " print(f\"Rejected {len(rejected_attribution_ids)} items due to low attribution.\")\n",
330
+ "\n",
331
+ " # --- Filter 2: Incorrect Completeness Trend ---\n",
332
+ " if completeness_trend_check:\n",
333
+ " bad_trend_mask = pivot_df['completeness_easy'] > pivot_df['completeness_hard']\n",
334
+ " rejected_trend_ids = pivot_df[bad_trend_mask].index\n",
335
+ " newly_rejected_count = len(rejected_trend_ids.difference(rejected_ids))\n",
336
+ " rejected_ids.update(rejected_trend_ids)\n",
337
+ " print(f\"Rejected {newly_rejected_count} additional items due to incorrect completeness trend.\")\n",
338
+ "\n",
339
+ " # --- Separate the data ---\n",
340
+ " clean_ids = all_ids - rejected_ids\n",
341
+ " \n",
342
+ " # We need to filter the original 'data' list, not the parsed one, to keep the full structure\n",
343
+ " original_df = pd.DataFrame(data)\n",
344
+ " clean_data = original_df[original_df['id'].isin(clean_ids)].to_dict('records')\n",
345
+ " rejected_data = original_df[original_df['id'].isin(rejected_ids)].to_dict('records')\n",
346
+ " \n",
347
+ " print(\"\\n--- Filtering Summary ---\")\n",
348
+ " print(f\"Total unique items analyzed: {len(all_ids)}\")\n",
349
+ " print(f\"Items kept (High Quality): {len(clean_ids)}\")\n",
350
+ " print(f\"Items rejected (Low Quality): {len(rejected_ids)}\")\n",
351
+ " \n",
352
+ " return clean_data, rejected_data\n",
353
+ "\n",
354
+ "def save_json(data, file_path):\n",
355
+ " \"\"\"Saves data to a JSON file.\"\"\"\n",
356
+ " with open(file_path, 'w', encoding='utf-8') as f:\n",
357
+ " json.dump(data, f, indent=4, ensure_ascii=False)\n",
358
+ " print(f\"Saved data to '{file_path}'\")\n",
359
+ "\n",
360
+ "\n",
361
+ "# --- Main Execution ---\n",
362
+ "# Replace with your file paths and desired thresholds\n",
363
+ "RESULTS_FILE = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json' # Make sure this points to your file\n",
364
+ "# CLEAN_FILE_PATH = '/home/mshahidul/readctrl/results/dataset_quality_check/high_quality_dataset.json'\n",
365
+ "# REJECTED_FILE_PATH = '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json'\n",
366
+ "ATTRIBUTION_THRESHOLD = 80.0\n",
367
+ "\n",
368
+ "clean_dataset, rejected_dataset = filter_low_quality_data(\n",
369
+ " RESULTS_FILE,\n",
370
+ " attribution_threshold=ATTRIBUTION_THRESHOLD\n",
371
+ ")\n",
372
+ "\n",
373
+ "if clean_dataset is not None:\n",
374
+ " save_json(clean_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json')\n",
375
+ " save_json(rejected_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json')"
376
+ ]
377
+ }
378
+ ],
379
+ "metadata": {
380
+ "kernelspec": {
381
+ "display_name": "unsloth",
382
+ "language": "python",
383
+ "name": "python3"
384
+ },
385
+ "language_info": {
386
+ "codemirror_mode": {
387
+ "name": "ipython",
388
+ "version": 3
389
+ },
390
+ "file_extension": ".py",
391
+ "mimetype": "text/x-python",
392
+ "name": "python",
393
+ "nbconvert_exporter": "python",
394
+ "pygments_lexer": "ipython3",
395
+ "version": "3.11.11"
396
+ }
397
+ },
398
+ "nbformat": 4,
399
+ "nbformat_minor": 5
400
+ }
code/finetune-inference/subclaim_support/readctrl_model.code-workspace ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": "../../.."
5
+ },
6
+ {
7
+ "path": "../../../../LLM_guard/CKA-Agent"
8
+ },
9
+ {
10
+ "path": "../../../../readctrl_model"
11
+ }
12
+ ]
13
+ }
code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_gpt5.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ import tqdm
9
+ from openai import OpenAI
10
+
11
+
12
+ # -----------------------------
13
+ # SUBCLAIM EXTRACTION PROMPT
14
+ # -----------------------------
15
def extraction_prompt(medical_text: str) -> str:
    """Build the instruction prompt asking the model to split *medical_text*
    into atomic, independently verifiable subclaims, returned as a JSON list
    of strings."""
    prompt = f"""
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text. Do not infer or add information.
4. Keep subclaims short, non-overlapping, and de-duplicated.
5. Preserve numbers, units, and dates exactly as written.
6. If the text is empty, return an empty JSON list.
7. Return ONLY a valid JSON list of strings (no extra text).

Medical Text:
{medical_text}

Return your output in JSON list format:
[
"subclaim 1",
"subclaim 2"
]
"""
    return prompt
39
+
40
+
41
def _load_openai_client() -> OpenAI:
    """Build an OpenAI client from the API-key file stored on disk."""
    key_path = "/home/mshahidul/api_new.json"
    with open(key_path, "r") as handle:
        keys = json.load(handle)
    return OpenAI(api_key=keys["openai"])
46
+
47
+
48
+ def _parse_json_list(text: str) -> List[str]:
49
+ cleaned = text.replace("```json", "").replace("```", "").strip()
50
+ start_idx = cleaned.find("[")
51
+ end_idx = cleaned.rfind("]") + 1
52
+ if start_idx == -1 or end_idx <= start_idx:
53
+ raise ValueError("No JSON list found")
54
+ parsed = json.loads(cleaned[start_idx:end_idx])
55
+ if not isinstance(parsed, list):
56
+ raise ValueError("Parsed JSON is not a list")
57
+ return parsed
58
+
59
+
60
def infer_subclaims(
    medical_text: str,
    client: OpenAI,
    model: str = "gpt-5-mini",
    retries: int = 1,
) -> List[str]:
    """Extract subclaims from *medical_text* via the chat-completions API.

    Makes up to ``retries + 1`` attempts, sleeping 1.5 s between them.
    Returns [] for empty input; if every attempt fails, returns a
    single-element list carrying the last error message so the calling
    pipeline keeps running instead of crashing.
    """
    if not medical_text or medical_text.strip() == "":
        return []

    user_prompt = extraction_prompt(medical_text)
    last_error = None
    for attempt in range(max(retries, 0) + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Return only a valid JSON list of strings."},
                    {"role": "user", "content": user_prompt},
                ],
            )
            raw = response.choices[0].message.content.strip()
            return _parse_json_list(raw)
        except Exception as exc:  # broad by design: any API/parse failure triggers a retry
            last_error = exc
            if attempt < retries:
                time.sleep(1.5)
    return [f"ERROR: {str(last_error)}"]
90
+
91
+
92
# -----------------------------
# MAIN EXECUTION
# -----------------------------
if __name__ == "__main__":
    # CLI: input dataset, output folder, and the OpenAI model to use.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        type=str,
        default="/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/combine/verified_combined_0-80.json",
    )
    parser.add_argument(
        "--save_folder",
        type=str,
        default="/home/mshahidul/readctrl/data/extracting_subclaim",
    )
    parser.add_argument("--model", type=str, default="gpt-5-mini")
    args = parser.parse_args()

    input_file = args.input_file
    save_folder = args.save_folder
    # Output name mirrors the input file: extracted_subclaims_<input>.json
    file_name = os.path.basename(input_file).split(".json")[0]
    output_file = os.path.join(save_folder, f"extracted_subclaims_{file_name}.json")

    Path(save_folder).mkdir(parents=True, exist_ok=True)
    client = _load_openai_client()

    with open(input_file, "r") as f:
        data = json.load(f)

    # Resume support: reload any previously saved results so items whose
    # subclaims were already extracted are not re-sent to the API.
    result = []
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            result = json.load(f)

    def _item_key(obj: dict) -> str:
        # Stable identifier for an item: prefer explicit index/id, then the
        # doc_id+label pair, then whichever of the two exists on its own.
        if obj.get("index") is not None:
            return str(obj.get("index"))
        if obj.get("id") is not None:
            return str(obj.get("id"))
        if obj.get("doc_id") is not None and obj.get("label") is not None:
            return f"{obj.get('doc_id')}_{obj.get('label')}"
        return str(obj.get("doc_id") or obj.get("label") or "")

    processed_data = {_item_key(item): item for item in result}

    for item in tqdm.tqdm(data):
        item_id = _item_key(item)
        existing_entry = processed_data.get(item_id)

        # 1. Process Fulltext (skipped when a valid list already exists)
        if not existing_entry or not isinstance(existing_entry.get("fulltext_subclaims"), list):
            f_sub = infer_subclaims(
                item.get("fulltext", ""),
                client,
                model=args.model,
                retries=2,
            )
        else:
            f_sub = existing_entry["fulltext_subclaims"]

        # 2. Process Summary
        if not existing_entry or not isinstance(existing_entry.get("summary_subclaims"), list):
            s_sub = infer_subclaims(
                item.get("summary", ""),
                client,
                model=args.model,
                retries=1,
            )
        else:
            s_sub = existing_entry["summary_subclaims"]

        # 3. Process Generated Texts (diff_label_texts). The field may be a
        # dict of label -> text (processed per label) or a single string
        # (processed as one text) — both shapes are handled below.
        diff_label_texts = item.get("diff_label_texts", "")
        if isinstance(diff_label_texts, dict):
            diff_label_subclaims = existing_entry.get("diff_label_subclaims", {}) if existing_entry else {}
            for label, text in diff_label_texts.items():
                if label not in diff_label_subclaims or not isinstance(diff_label_subclaims[label], list):
                    diff_label_subclaims[label] = infer_subclaims(
                        text,
                        client,
                        model=args.model,
                        retries=1,
                    )
        else:
            if not existing_entry or not isinstance(existing_entry.get("diff_label_subclaims"), list):
                diff_label_subclaims = infer_subclaims(
                    diff_label_texts,
                    client,
                    model=args.model,
                    retries=1,
                )
            else:
                diff_label_subclaims = existing_entry["diff_label_subclaims"]

        # 4. Save the (possibly partially reused) entry back into the map.
        new_entry = {
            "doc_id": item.get("doc_id"),
            "label": item.get("label"),
            "fulltext": item.get("fulltext", ""),
            "fulltext_subclaims": f_sub,
            "summary": item.get("summary", ""),
            "summary_subclaims": s_sub,
            "diff_label_texts": diff_label_texts,
            "diff_label_subclaims": diff_label_subclaims,
        }
        processed_data[item_id] = new_entry

        # Periodic checkpoint so a crash loses at most ~10 items of work.
        if len(processed_data) % 10 == 0:
            with open(output_file, "w") as f:
                json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    with open(output_file, "w") as f:
        json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    print(f"Extraction completed. File saved at: {output_file}")
code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_v4.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # Set GPU environment variables
3
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
5
+ import torch
6
+ from unsloth import FastLanguageModel
7
+ import json
8
+ import tqdm
9
+ import argparse
10
+
11
+
12
# -----------------------------
# MODEL CACHE
# -----------------------------
# Cached model/tokenizer plus the checkpoint path they were loaded from, so
# repeated calls are free but a different path triggers a fresh load.
_model_cache = {"model": None, "tokenizer": None, "path": None}

def load_finetuned_model(model_path: str):
    """Load (or return the cached) Unsloth model and tokenizer.

    Fix: the cache is now keyed by *model_path*. The original returned the
    first-loaded model even when a different checkpoint path was requested.

    Args:
        model_path: Path or hub name of the fine-tuned checkpoint.

    Returns:
        Tuple of (model, tokenizer).
    """
    if _model_cache["model"] is not None and _model_cache.get("path") == model_path:
        return _model_cache["model"], _model_cache["tokenizer"]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"], _model_cache["tokenizer"] = model, tokenizer
    _model_cache["path"] = model_path
    return model, tokenizer
30
+
31
# -----------------------------
# SUBCLAIM EXTRACTION PROMPT
# -----------------------------
def extraction_prompt(medical_text: str) -> str:
    """Build the instruction prompt asking the local model to split
    *medical_text* into atomic subclaims as a JSON list of strings."""
    prompt = f"""
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Return ONLY a valid JSON list of strings.

Medical Text:
{medical_text}

Return your output in JSON list format:
[
"subclaim 1",
"subclaim 2"
]
"""
    return prompt
55
# -----------------------------
# INFERENCE FUNCTION WITH AUTO-RETRY
# -----------------------------
def infer_subclaims(medical_text: str, model, tokenizer, temperature: float = 0.2, max_tokens: int = 2048, retries: int = 1) -> list:
    """Extract subclaims from *medical_text* with the local model.

    Generates a response, strips any <think> reasoning prefix, and parses
    the bracketed JSON list. On parse failure/truncation it retries with a
    budget enlarged by 2048 tokens, up to *retries* times; the final
    fallback returns the raw model text wrapped in a one-element list so
    the calling pipeline does not crash.
    """
    if not medical_text or medical_text.strip() == "":
        return []

    prompt = extraction_prompt(medical_text)
    messages = [{"role": "user", "content": prompt}]
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        # NOTE(review): temperature is passed but do_sample=False makes
        # decoding greedy, so temperature has no effect — confirm intent.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=False
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    # Remove reasoning if model is a "Thinker" model
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip()

    # JSON Parsing Logic
    try:
        start_idx = output_text.find('[')
        end_idx = output_text.rfind(']') + 1

        # Check if we have a complete bracketed pair
        if start_idx != -1 and end_idx > start_idx:
            content = output_text[start_idx:end_idx]
            parsed = json.loads(content)
            if isinstance(parsed, list):
                return parsed

        # If we are here, it means parsing failed or brackets were incomplete (truncation)
        raise ValueError("Incomplete JSON list")

    except (json.JSONDecodeError, ValueError):
        # If truncation happened and we have retries left, grow the budget
        if retries > 0:
            new_max = max_tokens + 2048  # Increment by 2k tokens
            print(f"\n[Warning] Truncation detected. Retrying with {new_max} tokens...")
            return infer_subclaims(medical_text, model, tokenizer, temperature, max_tokens=new_max, retries=retries-1)

        # Final fallback: return the raw text wrapped in a list so the pipeline doesn't crash
        return [output_text]
105
+
106
# -----------------------------
# MAIN EXECUTION
# -----------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    # Output name mirrors the input file: extracted_subclaims_<input>.json
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]
    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"

    os.makedirs(SAVE_FOLDER, exist_ok=True)
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}.json")

    model, tokenizer = load_finetuned_model(MODEL_PATH)

    with open(INPUT_FILE, "r") as f:
        data = json.load(f)

    # Resume support: reload previously saved output so finished items
    # skip inference entirely.
    result = []
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            result = json.load(f)

    processed_data = {str(item.get("index") or item.get("id")): item for item in result}

    for item in tqdm.tqdm(data):
        # Key by explicit index when present, otherwise by id.
        item_id = str(item.get("index") if item.get("index") is not None else item.get("id"))
        existing_entry = processed_data.get(item_id)

        # 1. Process Fulltext (The longest field, high initial token count)
        if not existing_entry or not isinstance(existing_entry.get("fulltext_subclaims"), list):
            f_sub = infer_subclaims(item.get("fulltext", ""), model, tokenizer, max_tokens=3072, retries=2)
        else:
            f_sub = existing_entry["fulltext_subclaims"]

        # 2. Process Summary
        if not existing_entry or not isinstance(existing_entry.get("summary_subclaims"), list):
            s_sub = infer_subclaims(item.get("summary", ""), model, tokenizer, max_tokens=2048, retries=1)
        else:
            s_sub = existing_entry["summary_subclaims"]

        # 3. Process All Generated Texts (diff_label_texts)
        diff_label_texts = item.get("diff_label_texts", {})
        diff_label_subclaims = existing_entry.get("diff_label_subclaims", {}) if existing_entry else {}

        for label, text in diff_label_texts.items():
            if label not in diff_label_subclaims or not isinstance(diff_label_subclaims[label], list):
                # Generated texts are shorter, but we still allow 1 retry
                diff_label_subclaims[label] = infer_subclaims(text, model, tokenizer, max_tokens=1536, retries=1)

        # 4. Save the merged entry back into the map.
        new_entry = {
            "index": item.get("index"),
            "id": item.get("id"),
            "fulltext": item.get("fulltext", ""),
            "fulltext_subclaims": f_sub,
            "summary": item.get("summary", ""),
            "summary_subclaims": s_sub,
            "diff_label_texts": diff_label_texts,
            "diff_label_subclaims": diff_label_subclaims,
            "readability_score": item.get("readability_score", None)
        }
        processed_data[item_id] = new_entry

        # Periodic checkpoint so a crash loses at most ~10 items of work.
        if len(processed_data) % 10 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    with open(OUTPUT_FILE, "w") as f:
        json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    print(f"Extraction completed. File saved at: {OUTPUT_FILE}")
code/finetune-inference/subclaim_support_extraction/inference_extract_subclaims_vllm.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
# -----------------------------
# API CONFIGURATION
# -----------------------------
# Base URL of a locally hosted OpenAI-compatible inference endpoint.
LOCAL_API_URL = "http://172.16.34.29:8004/v1"
# Served model path; must match the model name the local server was started with.
LOCAL_MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-extraction-8b_ctx_fp16"

# NOTE(review): the local server presumably ignores the API key, but the
# OpenAI client requires a non-empty value — confirm against server config.
client = OpenAI(
    base_url=LOCAL_API_URL,
    api_key="EMPTY"
)
17
+
18
+ # -----------------------------
19
+ # SUBCLAIM EXTRACTION PROMPT
20
+ # -----------------------------
21
def extraction_prompt(medical_text: str) -> str:
    """Build the subclaim-extraction prompt for `medical_text`.

    The prompt instructs the model to return ONLY a JSON array of strings;
    `infer_subclaims_api` later locates and parses that array with
    `json.loads`, so the "valid JSON only" instructions are load-bearing.
    """
    # `.strip()` removes only the leading/trailing newlines of the template.
    return f"""
You are an expert medical annotator.

Your task is to extract granular, factual subclaims from the provided medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the medical text carefully.
2. Extract factual statements explicitly stated in the text.
3. Each subclaim must:
- Contain exactly ONE factual assertion
- Come directly from the text (no inference or interpretation)
- Preserve original wording as much as possible
- Include any negation, uncertainty, or qualifier (e.g., "may", "not", "suggests")
4. Do NOT:
- Combine multiple facts into one subclaim
- Add new information
- Rephrase or normalize terminology
- Include opinions or recommendations
5. Return ONLY a valid JSON array of strings.
6. Use double quotes and valid JSON formatting only (no markdown, no commentary).

Medical Text:
{medical_text}

Return format:
[
"subclaim 1",
"subclaim 2"
]
""".strip()
53
+
54
+
55
+ # -----------------------------
56
+ # INFERENCE FUNCTION (vLLM API)
57
+ # -----------------------------
58
def infer_subclaims_api(medical_text: str, temperature: float = 0.2, max_tokens: int = 2048, retries: int = 1) -> list:
    """Extract subclaims from `medical_text` via the local OpenAI-compatible API.

    Args:
        medical_text: source text; blank/empty input short-circuits to [].
        temperature: sampling temperature forwarded to the API.
        max_tokens: response budget; grown by 2048 on each retry because the
            dominant failure mode is a truncated (hence unparseable) JSON array.
        retries: remaining retry attempts on API or parse failure.

    Returns:
        The parsed list of subclaim strings. When all retries are exhausted,
        the raw (unparseable) model output is returned wrapped in a single-item
        list so the caller can inspect it; [] if no output was ever received.
    """
    if not medical_text or not medical_text.strip():
        return []

    prompt = extraction_prompt(medical_text)
    # Fix: explicit sentinel replaces the original's fragile
    # `'output_text' in locals()` probe on the failure path.
    output_text = None

    try:
        response = client.chat.completions.create(
            model=LOCAL_MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        output_text = response.choices[0].message.content.strip()

        # Drop any chain-of-thought block emitted by "thinking" models.
        if "</think>" in output_text:
            output_text = output_text.split("</think>")[-1].strip()

        # Locate the outermost JSON array in the response.
        start_idx = output_text.find('[')
        end_idx = output_text.rfind(']') + 1
        if start_idx != -1 and end_idx > start_idx:
            parsed = json.loads(output_text[start_idx:end_idx])
            if isinstance(parsed, list):
                return parsed

        raise ValueError("Incomplete JSON list")

    # Fix: the original `except (json.JSONDecodeError, ValueError, Exception)`
    # tuple was redundant — `Exception` already subsumes the other two. The
    # broad catch is deliberate: API, network, and parse errors all retry.
    except Exception as e:
        if retries > 0:
            new_max = max_tokens + 2048
            print(f"\n[Warning] API error/truncation: {e}. Retrying with {new_max} tokens...")
            return infer_subclaims_api(medical_text, temperature, max_tokens=new_max, retries=retries-1)

        return [output_text] if output_text is not None else []
95
+
96
+ # -----------------------------
97
+ # MAIN EXECUTION
98
+ # -----------------------------
99
# Driver: slice the input dataset by [--start, --end), extract subclaims for
# each item's fulltext and summary via the local API, and checkpoint results
# to a range-specific JSON file so parallel jobs do not collide.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--start", type=int, default=0, help="Start index in the dataset")
    parser.add_argument("--end", type=int, default=None, help="End index (exclusive) in the dataset")
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]
    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    # Range-specific output naming helps if you want to run parallel jobs
    range_suffix = f"_{args.start}_{args.end if args.end is not None else 'end'}"
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}{range_suffix}.json")

    with open(INPUT_FILE, "r") as f:
        full_data = json.load(f)

    if args.end is None:
        args.end = len(full_data)

    # Slice the data based on user input
    data_subset = full_data[args.start:args.end]
    # NOTE(review): the truthiness check below misreports when --end is 0
    # (falls back to len(full_data) in the printout only) — minor, confirm.
    print(f"Processing range [{args.start} : {args.end if args.end else len(full_data)}]. Total: {len(data_subset)} items.")

    # Load existing progress if available
    # (resume map is keyed by stringified item id)
    processed_data = {}
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            existing_list = json.load(f)
        processed_data = {str(item.get("id")): item for item in existing_list}

    for item in tqdm.tqdm(data_subset):
        item_id = str(item.get("id"))

        # Check if this item in the subset was already processed
        if item_id in processed_data:
            continue

        # 1. Process Fulltext (larger budget + extra retry: fulltexts are long)
        f_sub = infer_subclaims_api(item.get("fulltext", ""), max_tokens=3072, retries=2)

        # 2. Process Summary
        s_sub = infer_subclaims_api(item.get("summary", ""), max_tokens=2048, retries=1)

        # 3. Save Entry
        processed_data[item_id] = {
            "id": item_id,
            "fulltext": item.get("fulltext", ""),
            "fulltext_subclaims": f_sub,
            "summary": item.get("summary", ""),
            "summary_subclaims": s_sub
        }

        # Periodic checkpoint (every 20 completed items)
        if len(processed_data) % 20 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    # Final Save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)

    print(f"Range extraction completed. File saved at: {OUTPUT_FILE}")
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
# -----------------------------
# CONFIGURATION
# -----------------------------
# Fine-tuned Qwen3-32B judge used for subclaim support checking, served
# behind an OpenAI-compatible endpoint on localhost.
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-4b_ctx-bf16"
API_URL = "http://localhost:8015/v1"
# NOTE(review): key presumably unused by the local server; client requires one.
API_KEY = "EMPTY"

client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # VERIFICATION PROMPT
18
+ # -----------------------------
19
def inference_prompt(text, subclaim):
    """Build the 3-way support-classification prompt (supported / refuted /
    not_supported) for `text` vs `subclaim`.

    `check_support` substring-matches these labels in the model output, so the
    label spellings here must stay in sync with that parser.
    """
    return f"""
You are a medical evidence evaluator.

Determine the relationship between the following medical text and the subclaim.

Label definitions:
- supported: the text directly provides evidence for the subclaim
- refuted: the text contradicts the subclaim
- not_supported: the text is related to the subclaim but does not provide evidence


Medical Text:
{text}

Subclaim:
{subclaim}

Respond only with one label: supported, refuted, or not_supported.
Give output without extra explanation.
"""
40
+
41
+ # -----------------------------
42
+ # VERIFICATION LOGIC
43
+ # -----------------------------
44
def check_support(text: str, subclaim: str) -> str:
    """Classify whether `text` supports `subclaim`.

    Returns one of 'supported', 'refuted', or 'not_supported'. Empty input
    and any API failure fall back to the conservative 'not_supported'.
    """
    # Guard clause: nothing to judge without both sides of the comparison.
    if not text or not subclaim:
        return "not_supported"

    query = inference_prompt(text, subclaim)

    try:
        completion = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": query}],
            max_tokens=20,
            temperature=0.0,
        )
        verdict = completion.choices[0].message.content.strip().lower()

        # "supported" is a substring of "not_supported", so the longer label
        # must be matched first; anything unrecognized maps to the default.
        for candidate in ("not_supported", "supported", "refuted"):
            if candidate in verdict:
                return candidate
        return "not_supported"

    except Exception as e:
        print(f"API error: {e}")
        return "not_supported"
74
+
75
def calculate_metric(subclaims_list: list, reference_text: str, metric_name: str):
    """Score `reference_text` against a list of subclaims.

    Each subclaim is labelled by `check_support`; the score is the fraction
    of subclaims labelled 'supported'.

    Args:
        subclaims_list: subclaim strings to verify.
        reference_text: text the subclaims are checked against.
        metric_name: metric identifier; kept for interface compatibility,
            not used in the computation.

    Returns:
        dict with 'score' (float in [0, 1]) and 'details' (one
        {'subclaim', 'label'} dict per input subclaim, in order).
    """
    # Empty input: nothing to verify; score defined as 0.0.
    if not subclaims_list:
        return {"score": 0.0, "details": []}

    details = [
        {"subclaim": subclaim, "label": check_support(reference_text, subclaim)}
        for subclaim in subclaims_list
    ]
    # Fix: the original re-checked `len(subclaims_list) > 0` before dividing,
    # which is dead code after the early return above.
    supported_count = sum(1 for entry in details if entry["label"] == "supported")

    return {
        "score": supported_count / len(subclaims_list),
        "details": details
    }
100
+
101
+ # -----------------------------
102
+ # MAIN
103
+ # -----------------------------
104
# Driver: for each dataset item, score attribution / conciseness /
# completeness for the easy / intermediate / hard generations by checking
# subclaim support against the appropriate reference text, with range
# slicing for parallel jobs and resume-from-output-file support.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json",
                        help="Path to input JSON with subclaims")

    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/concise_complete_attr_cal_v2",
                        help="Folder to save results")

    # Range arguments
    parser.add_argument("--start_index", type=int, default=0, help="Start index")
    parser.add_argument("--end_index", type=int, default=-1, help="End index (exclusive). -1 for all.")

    args = parser.parse_args()

    INPUT_FILE = args.input_file
    SAVE_FOLDER = args.save_folder
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    # -----------------------------
    # Load Data
    # -----------------------------
    print(f"Loading data from {INPUT_FILE}...")
    with open(INPUT_FILE, "r") as f:
        all_data = json.load(f)

    # -----------------------------
    # Slice Data based on Range
    # -----------------------------
    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len

    # Ensure end doesn't exceed total length
    if end > total_len:
        end = total_len

    data_slice = all_data[start:end]

    print(f"Total dataset size: {total_len}")
    print(f"Processing range: {start} to {end}")
    print(f"Items in this batch: {len(data_slice)}")

    # -----------------------------
    # Output Filename (includes range)
    # -----------------------------
    # Filename format: evaluated_metrics_0_100.json
    OUTPUT_FILE = os.path.join(
        SAVE_FOLDER,
        f"evaluated_metrics_{start}_{end}.json"
    )

    # -----------------------------
    # Resume Logic
    # -----------------------------
    # NOTE(review): the bare `except` below silently resets progress on any
    # read/parse error, including KeyboardInterrupt — consider narrowing.
    processed_results = []
    if os.path.exists(OUTPUT_FILE):
        print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
        try:
            with open(OUTPUT_FILE, "r") as f:
                processed_results = json.load(f)
        except:
            processed_results = []

    processed_ids = {item['id'] for item in processed_results}

    # Filter only the sliced data
    to_process = [item for item in data_slice if item['id'] not in processed_ids]

    print(f"Already processed in this file: {len(processed_ids)}")
    print(f"Remaining to process: {len(to_process)}")

    # -----------------------------
    # Processing Loop
    # -----------------------------
    for item in tqdm.tqdm(to_process):

        # 1. Prepare Texts
        easy_text = item.get("easy_text", "")
        inter_text = item.get("intermediate_text", "")
        hard_text = item.get("hard_text", "")
        fulltext = item.get("fulltext", "")
        summary = item.get("summary", "")

        # 2. Prepare Subclaim Lists (coerce anything non-list to [])
        def ensure_list(x): return x if isinstance(x, list) else []

        easy_subs = ensure_list(item.get("easy_subclaims", []))
        inter_subs = ensure_list(item.get("intermediate_subclaims", []))
        hard_subs = ensure_list(item.get("hard_subclaims", []))
        # NOTE(review): full_subs is computed but never used below — confirm.
        full_subs = ensure_list(item.get("fulltext_subclaims", []))
        summary_subs = ensure_list(item.get("summary_subclaims", []))

        # ---------------------------------------------------------
        # METRICS CALCULATION
        # ---------------------------------------------------------

        # Attribution: Generated Subclaims -> Full Text
        attr_easy = calculate_metric(easy_subs, fulltext, "attribution")
        attr_inter = calculate_metric(inter_subs, fulltext, "attribution")
        attr_hard = calculate_metric(hard_subs, fulltext, "attribution")

        # Conciseness: Generated Subclaims -> Summary Text
        conc_easy = calculate_metric(easy_subs, summary, "conciseness")
        conc_inter = calculate_metric(inter_subs, summary, "conciseness")
        conc_hard = calculate_metric(hard_subs, summary, "conciseness")

        # Completeness: summary Subclaims -> Generated Text
        comp_easy = calculate_metric(summary_subs, easy_text, "completeness")
        comp_inter = calculate_metric(summary_subs, inter_text, "completeness")
        comp_hard = calculate_metric(summary_subs, hard_text, "completeness")

        # Construct Output
        result_item = item.copy()
        result_item["metrics"] = {
            "easy": {
                "attribution": attr_easy,
                "conciseness": conc_easy,
                "completeness": comp_easy
            },
            "intermediate": {
                "attribution": attr_inter,
                "conciseness": conc_inter,
                "completeness": comp_inter
            },
            "hard": {
                "attribution": attr_hard,
                "conciseness": conc_hard,
                "completeness": comp_hard
            }
        }

        processed_results.append(result_item)

        # Save frequently (every 20 completed items)
        if len(processed_results) % 20 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=4, ensure_ascii=False)

    # Final Save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=4, ensure_ascii=False)

    print(f"Evaluation for range {start}:{end} complete. Saved to: {OUTPUT_FILE}")
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_tesing_v2.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
# -----------------------------
# CONFIGURATION
# -----------------------------
# Fine-tuned Qwen3-32B judge (8k-context variant) for subclaim support
# checking, served behind a remote OpenAI-compatible endpoint.
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx-bf16"
API_URL = "http://172.16.34.29:8004/v1"
# NOTE(review): key presumably unused by the local server; client requires one.
API_KEY = "EMPTY"

client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # VERIFICATION PROMPT
18
+ # -----------------------------
19
def inference_prompt(text, subclaim):
    """Build the strict binary grounding prompt (supported / not_supported)
    for `text` vs `subclaim`.

    Unlike the 3-label variants in the sibling scripts, this prompt requests
    only two labels; `check_support` still substring-matches the output, so
    the label spellings here must stay in sync with that parser.
    """
    return f"""You are a clinical evidence auditor. Your evaluation must be based STRICTLY and ONLY on the provided medical text.

### MANDATORY GROUNDING RULES:
1. NO OUTSIDE KNOWLEDGE: Do not use your internal medical knowledge. Even if a subclaim is "common sense" in medicine, if it is not explicitly in the TEXT, it is 'not_supported'.
2. NO LOGICAL LEAPS: Do not bridge gaps in logic. (e.g., If the text mentions "high blood sugar" but not the word "diabetes", you cannot support a claim of "diabetes").
3. EXACT NUMERICAL MATCHING: Any doses (e.g., 500mg), frequencies (e.g., twice daily), or durations (e.g., 10 days) mentioned in the subclaim must match the text perfectly. If they are missing or different in the text, label as 'not_supported'.
4. DEFAULT TO NOT SUPPORTED: If the text is vague, ambiguous, or only suggests a possibility, you MUST choose 'not_supported'.
5. CLOSED-WORLD REALITY: Treat the TEXT as the only information that exists in the world.

### Medical Text:
{text}

### Subclaim:
{subclaim}

Output exactly one word ('supported' or 'not_supported') based on the strict rules above:"""
36
+
37
+ # -----------------------------
38
+ # VERIFICATION LOGIC
39
+ # -----------------------------
40
def check_support(text: str, subclaim: str, error_log=None) -> str:
    """Ask the judge model whether `text` supports `subclaim`.

    Args:
        text: reference medical text.
        subclaim: claim to verify against the text.
        error_log: optional list; API failures are appended to it as
            {'subclaim', 'error_msg', 'type'} dicts.

    Returns:
        'supported', 'refuted', or 'not_supported'. Empty input and any API
        failure return the conservative default 'not_supported'.
    """
    if not text or not subclaim:
        return "not_supported"

    prompt = inference_prompt(text, subclaim)

    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=0.1,
        )
        # Fix: the original read `response.choices[0].message.content` twice
        # (once in each branch) and used `split("</think>")[1]`, which drops
        # the final answer when more than one think block is emitted. Read
        # once, keep everything after the LAST think block, normalize once.
        res = response.choices[0].message.content
        if "</think>" in res:
            res = res.split("</think>")[-1]
        res = res.strip().lower()

        # Order matters: "supported" is a substring of "not_supported".
        if "not_supported" in res:
            return "not_supported"
        elif "supported" in res:
            return "supported"
        elif "refuted" in res:
            return "refuted"
        else:
            return "not_supported"

    except Exception as e:
        # --- ERROR TRACKING ---
        if error_log is not None:
            error_details = {
                "subclaim": subclaim,
                "error_msg": str(e),
                "type": "API_ERROR"
            }
            error_log.append(error_details)
        # ----------------------
        return "not_supported"
86
+
87
+
88
+
89
+ # -----------------------------
90
+ # MAIN
91
+ # -----------------------------
92
# Driver: compare the judge model's supported/not_supported verdicts against
# GPT-5 ground-truth labels for each (text, subclaim) pair, recording
# per-subclaim correctness with resume support and periodic checkpoints.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json",
                        help="Path to input JSON with subclaims")

    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/concise_complete_attr_testing",
                        help="Folder to save results")

    # Range arguments
    parser.add_argument("--start_index", type=int, default=0, help="Start index")
    parser.add_argument("--end_index", type=int, default=-1, help="End index (exclusive). -1 for all.")

    args = parser.parse_args()

    INPUT_FILE = args.input_file
    SAVE_FOLDER = args.save_folder
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    # -----------------------------
    # Load Data
    # -----------------------------
    print(f"Loading data from {INPUT_FILE}...")
    with open(INPUT_FILE, "r") as f:
        all_data = json.load(f)

    # -----------------------------
    # Slice Data based on Range
    # -----------------------------
    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len

    if end > total_len:
        end = total_len

    data_slice = all_data[start:end]

    print(f"Total dataset size: {total_len}")
    print(f"Processing range: {start} to {end}")
    print(f"Items in this batch: {len(data_slice)}")

    # -----------------------------
    # Output Files
    # -----------------------------
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}_qwen3_32B_v2.json")

    # -----------------------------
    # Resume Logic
    # -----------------------------
    processed_results = []
    if os.path.exists(OUTPUT_FILE):
        print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
        try:
            with open(OUTPUT_FILE, "r") as f:
                processed_results = json.load(f)
        except Exception:
            processed_results = []

    # Fix: result entries are saved under the key "medical_text" (see
    # result_entry below), but the original built this set from
    # item['full_text'] of the OUTPUT entries — a key that never exists
    # there — so resume silently reprocessed everything on every run.
    processed_ids = {entry.get('medical_text') for entry in processed_results}
    to_process = [item for item in data_slice if item['full_text'] not in processed_ids]

    print(f"Already processed in this file: {len(processed_ids)}")
    print(f"Remaining to process: {len(to_process)}")

    # -----------------------------
    # Initialize Error Tracker
    # -----------------------------
    global_error_log = []

    # -----------------------------
    # Processing Loop
    # -----------------------------
    pbar = tqdm.tqdm(to_process)

    for item in pbar:
        text = item.get('full_text', '')
        # Fix: the original `item.get('dat', [])['dat']` raised TypeError
        # whenever 'dat' was missing (indexing the [] default with a string).
        # Guard both the missing key and a non-dict payload.
        dat = item.get('dat') or {}
        subclaims = dat.get('dat', []) if isinstance(dat, dict) else []

        for subclaim in subclaims:
            subclaim_text = subclaim.get('subclaim', '')
            # Ground-truth label, normalized to the judge's vocabulary.
            label_gt = subclaim.get('status', 'not_supported').strip().lower()

            label_gen = check_support(text, subclaim_text, error_log=global_error_log)
            # Exact label match only; "supported" vs "not_supported" are
            # distinct strings so == comparisons are safe here.
            correctness = (label_gen == label_gt and
                           label_gt in ("supported", "not_supported"))

            result_entry = {
                "medical_text": text,
                "subclaim": subclaim,
                "label_gt": label_gt,
                "label_gen": label_gen,
                "correctness": correctness
            }
            processed_results.append(result_entry)

            # Checkpoint every other result (judge calls are slow; losing
            # more than one verdict to a crash is expensive).
            if len(processed_results) % 2 == 0:
                with open(OUTPUT_FILE, "w") as f:
                    json.dump(processed_results, f, indent=2, ensure_ascii=False)

    # Final Save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=2, ensure_ascii=False)
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v2.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
# -----------------------------
# CONFIGURATION
# -----------------------------
# Fine-tuned Qwen3-32B judge (8k-context variant) for subclaim support
# checking, served behind an OpenAI-compatible endpoint on localhost.
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx-bf16"
API_URL = "http://localhost:8015/v1"
# NOTE(review): key presumably unused by the local server; client requires one.
API_KEY = "EMPTY"

client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # VERIFICATION PROMPT
18
+ # -----------------------------
19
def inference_prompt(text, subclaim):
    """Build the conservative 3-label support prompt (supported / refuted /
    not_supported) for `text` vs `subclaim`.

    This variant adds strict numeric-matching rules and notes that the input
    may be in Spanish. `check_support` substring-matches the labels in the
    model output, so the label spellings must stay in sync with that parser.
    """
    return f"""
You are a precise, conservative medical evidence evaluator.

Your task:
Determine the relationship between the following MEDICAL TEXT and the SUBCLAIM.

Use ONLY these labels (lowercase):
- supported → the TEXT clearly supports the SUBCLAIM. The information is
explicitly stated or follows from a very direct and
unambiguous medical inference (e.g., “fiebre de 39°C”
supports “tenía fiebre”).
- refuted → the TEXT clearly contradicts the SUBCLAIM (e.g., the TEXT
states the opposite, or provides mutually exclusive values:
different drug, dose, duration, time point, diagnosis, etc.).
- not_supported → the TEXT is related to the SUBCLAIM but does NOT provide
enough evidence to mark it as supported or refuted
(e.g., missing or different dose, duration, timing,
route, frequency, or diagnosis; or the claim simply
is not mentioned).

Important instructions:
- Be STRICT and CONSERVATIVE:
- If exact numerical details (dose, time, duration, frequency, age, etc.)
in the SUBCLAIM are not explicitly stated or clearly implied in the TEXT,
choose not_supported.
- Do NOT assume or infer information beyond what is clearly supported by
the TEXT, even if it seems medically plausible.
- Use refuted ONLY when there is a clear contradiction between TEXT and
SUBCLAIM.
- Ignore your external medical knowledge; base your decision ONLY on the TEXT.
- The TEXT and SUBCLAIM may be in Spanish; evaluate them as written.

Medical Text:
{text}

Subclaim:
{subclaim}

Respond with exactly ONE label:
supported
refuted
not_supported
"""
63
+
64
+ # -----------------------------
65
+ # VERIFICATION LOGIC
66
+ # -----------------------------
67
def check_support(text: str, subclaim: str, item_id=None, error_log=None) -> str:
    """Ask the judge model whether `text` supports `subclaim`.

    Returns one of 'supported', 'refuted', or 'not_supported'. On any API
    failure the error is recorded in `error_log` (when provided, keyed by
    `item_id`) and the conservative default 'not_supported' is returned.
    """
    # Guard clause: nothing to judge without both sides of the comparison.
    if not text or not subclaim:
        return "not_supported"

    query = inference_prompt(text, subclaim)

    try:
        completion = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": query}],
            max_tokens=20,
            temperature=0.0,
        )
        verdict = completion.choices[0].message.content.strip().lower()

        # "supported" is a substring of "not_supported", so the longer label
        # must be matched first; anything unrecognized maps to the default.
        for candidate in ("not_supported", "supported", "refuted"):
            if candidate in verdict:
                return candidate
        return "not_supported"

    except Exception as e:
        # --- ERROR TRACKING ---
        if error_log is not None:
            error_log.append({
                "id": item_id,
                "subclaim": subclaim,
                "error_msg": str(e),
                "type": "API_ERROR"
            })
        # ----------------------

        # Surface the failure on the console as it happens.
        print(f"\n[!] Error on ID {item_id}: {e}")
        return "not_supported"
110
+
111
def calculate_metric(subclaims_list: list, reference_text: str, metric_name: str, item_id=None, error_log=None):
    """Score `reference_text` against a list of subclaims.

    Each subclaim is labelled by `check_support` (with `item_id`/`error_log`
    forwarded for error tracking); the score is the fraction of subclaims
    labelled 'supported'.

    Args:
        subclaims_list: subclaim strings to verify.
        reference_text: text the subclaims are checked against.
        metric_name: metric identifier; kept for interface compatibility,
            not used in the computation.
        item_id: dataset id forwarded into error-log entries.
        error_log: optional list collecting API failures.

    Returns:
        dict with 'score' (float in [0, 1]) and 'details' (one
        {'subclaim', 'label'} dict per input subclaim, in order).
    """
    # Empty input: nothing to verify; score defined as 0.0.
    if not subclaims_list:
        return {"score": 0.0, "details": []}

    details = [
        {
            "subclaim": subclaim,
            "label": check_support(reference_text, subclaim, item_id=item_id, error_log=error_log)
        }
        for subclaim in subclaims_list
    ]
    # Fix: the original re-checked `len(subclaims_list) > 0` before dividing,
    # which is dead code after the early return above.
    supported_count = sum(1 for entry in details if entry["label"] == "supported")

    return {
        "score": supported_count / len(subclaims_list),
        "details": details
    }
138
+
139
+ # -----------------------------
140
+ # MAIN
141
+ # -----------------------------
142
# Driver: same attribution / conciseness / completeness evaluation as
# subclaim_support_cal.py, extended with a per-item error log that is
# surfaced on the progress bar and written to a separate range-specific file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json",
                        help="Path to input JSON with subclaims")

    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/concise_complete_attr_cal_v3",
                        help="Folder to save results")

    # Range arguments
    parser.add_argument("--start_index", type=int, default=0, help="Start index")
    parser.add_argument("--end_index", type=int, default=-1, help="End index (exclusive). -1 for all.")

    args = parser.parse_args()

    INPUT_FILE = args.input_file
    SAVE_FOLDER = args.save_folder
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    # -----------------------------
    # Load Data
    # -----------------------------
    print(f"Loading data from {INPUT_FILE}...")
    with open(INPUT_FILE, "r") as f:
        all_data = json.load(f)

    # -----------------------------
    # Slice Data based on Range
    # -----------------------------
    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len

    if end > total_len:
        end = total_len

    data_slice = all_data[start:end]

    print(f"Total dataset size: {total_len}")
    print(f"Processing range: {start} to {end}")
    print(f"Items in this batch: {len(data_slice)}")

    # -----------------------------
    # Output Files
    # -----------------------------
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}.json")
    ERROR_LOG_FILE = os.path.join(SAVE_FOLDER, f"error_log_{start}_{end}.json")

    # -----------------------------
    # Resume Logic
    # -----------------------------
    # NOTE(review): the bare `except` below silently resets progress on any
    # read/parse error, including KeyboardInterrupt — consider narrowing.
    processed_results = []
    if os.path.exists(OUTPUT_FILE):
        print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
        try:
            with open(OUTPUT_FILE, "r") as f:
                processed_results = json.load(f)
        except:
            processed_results = []

    processed_ids = {item['id'] for item in processed_results}
    to_process = [item for item in data_slice if item['id'] not in processed_ids]

    print(f"Already processed in this file: {len(processed_ids)}")
    print(f"Remaining to process: {len(to_process)}")

    # -----------------------------
    # Initialize Error Tracker
    # -----------------------------
    global_error_log = []

    # -----------------------------
    # Processing Loop
    # -----------------------------
    # Added tqdm postfix to show error count in real-time
    pbar = tqdm.tqdm(to_process)

    for item in pbar:
        current_id = item.get('id', 'unknown')

        # 1. Prepare Texts
        easy_text = item.get("easy_text", "")
        inter_text = item.get("intermediate_text", "")
        hard_text = item.get("hard_text", "")
        fulltext = item.get("fulltext", "")
        summary = item.get("summary", "")

        # 2. Prepare Subclaim Lists (coerce anything non-list to [])
        def ensure_list(x): return x if isinstance(x, list) else []

        easy_subs = ensure_list(item.get("easy_subclaims", []))
        inter_subs = ensure_list(item.get("intermediate_subclaims", []))
        hard_subs = ensure_list(item.get("hard_subclaims", []))
        # NOTE(review): full_subs is computed but never used below — confirm.
        full_subs = ensure_list(item.get("fulltext_subclaims", []))
        summary_subs = ensure_list(item.get("summary_subclaims", []))

        # ---------------------------------------------------------
        # METRICS CALCULATION (Now passing id and error_log)
        # ---------------------------------------------------------

        # Attribution: Generated Subclaims -> Full Text
        attr_easy = calculate_metric(easy_subs, fulltext, "attribution", current_id, global_error_log)
        attr_inter = calculate_metric(inter_subs, fulltext, "attribution", current_id, global_error_log)
        attr_hard = calculate_metric(hard_subs, fulltext, "attribution", current_id, global_error_log)

        # Conciseness: Generated Subclaims -> Summary Text
        conc_easy = calculate_metric(easy_subs, summary, "conciseness", current_id, global_error_log)
        conc_inter = calculate_metric(inter_subs, summary, "conciseness", current_id, global_error_log)
        conc_hard = calculate_metric(hard_subs, summary, "conciseness", current_id, global_error_log)

        # Completeness: summary Subclaims -> Generated Text
        comp_easy = calculate_metric(summary_subs, easy_text, "completeness", current_id, global_error_log)
        comp_inter = calculate_metric(summary_subs, inter_text, "completeness", current_id, global_error_log)
        comp_hard = calculate_metric(summary_subs, hard_text, "completeness", current_id, global_error_log)

        # Construct Output
        result_item = item.copy()
        result_item["metrics"] = {
            "easy": {
                "attribution": attr_easy,
                "conciseness": conc_easy,
                "completeness": comp_easy
            },
            "intermediate": {
                "attribution": attr_inter,
                "conciseness": conc_inter,
                "completeness": comp_inter
            },
            "hard": {
                "attribution": attr_hard,
                "conciseness": conc_hard,
                "completeness": comp_hard
            }
        }

        processed_results.append(result_item)

        # Update progress bar with error count
        if len(global_error_log) > 0:
            pbar.set_postfix({"Errors": len(global_error_log)})

        # Save frequently (every 10 completed items)
        if len(processed_results) % 10 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=4, ensure_ascii=False)

    # Final Save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=4, ensure_ascii=False)

    print(f"Evaluation for range {start}:{end} complete. Saved to: {OUTPUT_FILE}")

    # -----------------------------
    # Error Reporting
    # -----------------------------
    if global_error_log:
        print(f"\n⚠️ WARNING: {len(global_error_log)} API errors occurred during processing.")
        with open(ERROR_LOG_FILE, "w") as f:
            json.dump(global_error_log, f, indent=4)
        print(f"Error details saved to: {ERROR_LOG_FILE}")
    else:
        print("\n✅ Success: No API errors detected.")
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v3.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import re
5
+ from vllm import LLM, SamplingParams
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ MODEL_PATH = "Qwen/Qwen3-30B-A3B-Thinking-2507"
11
+
12
+ # -----------------------------
13
+ # PROMPT & CLEANING
14
+ # -----------------------------
15
+ def inference_prompt(text, subclaim):
16
+ return f"""
17
+ You are a precise, conservative medical evidence evaluator.
18
+
19
+ Your task:
20
+ Determine the relationship between the following MEDICAL TEXT and the SUBCLAIM.
21
+
22
+ Use ONLY these labels (lowercase):
23
+ - supported → the TEXT clearly supports the SUBCLAIM.
24
+ - refuted → the TEXT clearly contradicts the SUBCLAIM.
25
+ - not_supported → the TEXT is related to the SUBCLAIM but does NOT provide enough evidence.
26
+
27
+ Important instructions:
28
+ - Analyze the text carefully before deciding.
29
+ - Be STRICT and CONSERVATIVE.
30
+ - If exact numerical details differ or are missing, choose not_supported.
31
+ - Respond with exactly ONE label at the end.
32
+
33
+ Medical Text:
34
+ {text}
35
+
36
+ Subclaim:
37
+ {subclaim}
38
+
39
+ Respond with exactly ONE label:
40
+ supported
41
+ refuted
42
+ not_supported
43
+ """
44
+
45
+ def clean_response(text):
46
+ """
47
+ Removes <think> tags and extracts the final label.
48
+ """
49
+ if not text:
50
+ return "not_supported"
51
+
52
+ # Remove thinking block
53
+ text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
54
+ text = text.strip().lower()
55
+
56
+ # Extract the last valid label found
57
+ valid_labels = ["not_supported", "supported", "refuted"]
58
+
59
+ # Check if the text ends with a valid label (ignoring punctuation)
60
+ for label in valid_labels:
61
+ if label in text:
62
+ return label
63
+
64
+ return "not_supported"
65
+
66
+ # -----------------------------
67
+ # MAIN
68
+ # -----------------------------
69
+ if __name__ == "__main__":
70
+ parser = argparse.ArgumentParser()
71
+ parser.add_argument("--input_file", type=str,
72
+ default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json")
73
+ parser.add_argument("--save_folder", type=str,
74
+ default="/home/mshahidul/readctrl/data/concise_complete_attr_cal_v4")
75
+ parser.add_argument("--start_index", type=int, default=0)
76
+ parser.add_argument("--end_index", type=int, default=-1)
77
+
78
+ # vLLM Performance Arguments
79
+ parser.add_argument("--gpu_utilization", type=float, default=0.95)
80
+ parser.add_argument("--max_model_len", type=int, default=16384) # Adjusted for A100 80GB
81
+
82
+ args = parser.parse_args()
83
+
84
+ # 1. Setup Data
85
+ INPUT_FILE = args.input_file
86
+ SAVE_FOLDER = args.save_folder
87
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
88
+ OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{args.start_index}_{args.end_index}.json")
89
+
90
+ print(f"Loading data from {INPUT_FILE}...")
91
+ with open(INPUT_FILE, "r") as f:
92
+ all_data = json.load(f)
93
+
94
+ # Slice Data
95
+ total_len = len(all_data)
96
+ start = args.start_index
97
+ end = args.end_index if args.end_index != -1 else total_len
98
+ data_slice = all_data[start:end]
99
+ print(f"Processing range: {start} to {end} ({len(data_slice)} items)")
100
+
101
+ # -----------------------------
102
+ # PHASE 1: PREPARE PROMPTS
103
+ # -----------------------------
104
+ print("Building prompt list...")
105
+
106
+ # We need to flatten the hierarchy to feed vLLM a single list of strings
107
+ # We will store metadata to reconstruct the structure later
108
+ prompts_list = []
109
+ request_metadata = [] # Syncs index-to-index with prompts_list
110
+
111
+ def add_request(item_id, text, subclaims, metric_type, level):
112
+ if not subclaims or not isinstance(subclaims, list):
113
+ return
114
+ for sub in subclaims:
115
+ p = inference_prompt(text, sub)
116
+ prompts_list.append(p)
117
+ request_metadata.append({
118
+ "id": item_id,
119
+ "metric_type": metric_type, # 'attribution', 'conciseness', 'completeness'
120
+ "level": level, # 'easy', 'intermediate', 'hard'
121
+ "subclaim": sub
122
+ })
123
+
124
+ for item in data_slice:
125
+ itm_id = item.get('id')
126
+ fulltext = item.get("fulltext", "")
127
+ summary = item.get("summary", "")
128
+
129
+ easy_txt = item.get("easy_text", "")
130
+ inter_txt = item.get("intermediate_text", "")
131
+ hard_txt = item.get("hard_text", "")
132
+
133
+ # A. ATTRIBUTION (Subclaims -> Fulltext)
134
+ add_request(itm_id, fulltext, item.get("easy_subclaims", []), "attribution", "easy")
135
+ add_request(itm_id, fulltext, item.get("intermediate_subclaims", []), "attribution", "intermediate")
136
+ add_request(itm_id, fulltext, item.get("hard_subclaims", []), "attribution", "hard")
137
+
138
+ # B. CONCISENESS (Subclaims -> Summary)
139
+ add_request(itm_id, summary, item.get("easy_subclaims", []), "conciseness", "easy")
140
+ add_request(itm_id, summary, item.get("intermediate_subclaims", []), "conciseness", "intermediate")
141
+ add_request(itm_id, summary, item.get("hard_subclaims", []), "conciseness", "hard")
142
+
143
+ # C. COMPLETENESS (Summary Subclaims -> Generated Text)
144
+ sum_subs = item.get("summary_subclaims", [])
145
+ add_request(itm_id, easy_txt, sum_subs, "completeness", "easy")
146
+ add_request(itm_id, inter_txt, sum_subs, "completeness", "intermediate")
147
+ add_request(itm_id, hard_txt, sum_subs, "completeness", "hard")
148
+
149
+ print(f"Total inference requests generated: {len(prompts_list)}")
150
+
151
+ if len(prompts_list) == 0:
152
+ print("No subclaims found to process.")
153
+ exit()
154
+
155
+ # -----------------------------
156
+ # PHASE 2: BATCH INFERENCE
157
+ # -----------------------------
158
+ print("Initializing vLLM Engine...")
159
+ llm = LLM(
160
+ model=MODEL_PATH,
161
+ trust_remote_code=True,
162
+ dtype="bfloat16",
163
+ gpu_memory_utilization=args.gpu_utilization,
164
+ max_model_len=args.max_model_len,
165
+ enforce_eager=True # Helps with Qwen MoE stability
166
+ )
167
+
168
+ # Allow max_tokens for "Thinking", but we only keep the label later
169
+ sampling_params = SamplingParams(temperature=0, max_tokens=1024)
170
+
171
+ print("Running Inference...")
172
+ outputs = llm.generate(prompts_list, sampling_params)
173
+
174
+ # -----------------------------
175
+ # PHASE 3: AGGREGATE RESULTS
176
+ # -----------------------------
177
+ print("Aggregating results...")
178
+
179
+ # Dictionary to reconstruct the data: results_map[id][metric][level] = list of results
180
+ results_map = {}
181
+
182
+ for i, output in enumerate(outputs):
183
+ meta = request_metadata[i]
184
+ generated_text = output.outputs[0].text
185
+
186
+ # Clean the Qwen "Thinking" output
187
+ label = clean_response(generated_text)
188
+
189
+ item_id = meta['id']
190
+ metric = meta['metric_type']
191
+ level = meta['level']
192
+
193
+ if item_id not in results_map:
194
+ results_map[item_id] = {
195
+ "attribution": {"easy": [], "intermediate": [], "hard": []},
196
+ "conciseness": {"easy": [], "intermediate": [], "hard": []},
197
+ "completeness": {"easy": [], "intermediate": [], "hard": []},
198
+ }
199
+
200
+ results_map[item_id][metric][level].append({
201
+ "subclaim": meta['subclaim'],
202
+ "label": label
203
+ })
204
+
205
+ # -----------------------------
206
+ # PHASE 4: CALCULATE SCORES & SAVE
207
+ # -----------------------------
208
+ final_output = []
209
+
210
+ for original_item in data_slice:
211
+ itm_id = original_item.get('id')
212
+
213
+ # Create a clean copy of the item
214
+ new_item = original_item.copy()
215
+
216
+ # Structure for metrics
217
+ metrics_struct = {
218
+ "easy": {}, "intermediate": {}, "hard": {}
219
+ }
220
+
221
+ # If we processed this item (it had subclaims)
222
+ if itm_id in results_map:
223
+ raw_data = results_map[itm_id]
224
+
225
+ # Iterate levels (easy, intermediate, hard)
226
+ for level in ["easy", "intermediate", "hard"]:
227
+ # Iterate metrics (attribution, conciseness, completeness)
228
+ for metric in ["attribution", "conciseness", "completeness"]:
229
+
230
+ subclaim_results = raw_data[metric][level]
231
+ total = len(subclaim_results)
232
+ supported = sum(1 for x in subclaim_results if x['label'] == 'supported')
233
+ score = (supported / total) if total > 0 else 0.0
234
+
235
+ metrics_struct[level][metric] = {
236
+ "score": score,
237
+ "details": subclaim_results
238
+ }
239
+ else:
240
+ # Handle empty items
241
+ empty_res = {"score": 0.0, "details": []}
242
+ for level in ["easy", "intermediate", "hard"]:
243
+ metrics_struct[level] = {
244
+ "attribution": empty_res,
245
+ "conciseness": empty_res,
246
+ "completeness": empty_res
247
+ }
248
+
249
+ new_item["metrics"] = metrics_struct
250
+ final_output.append(new_item)
251
+
252
+ print(f"Saving {len(final_output)} items to {OUTPUT_FILE}...")
253
+ with open(OUTPUT_FILE, "w") as f:
254
+ json.dump(final_output, f, indent=4, ensure_ascii=False)
255
+
256
+ print("Done.")
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v4.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
11
+ API_URL = "http://172.16.34.29:8004/v1"
12
+ API_KEY = "EMPTY"
13
+
14
+ client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # VERIFICATION PROMPT
18
+ # -----------------------------
19
+ def inference_prompt(text, subclaim):
20
+ return f"""
21
+ You are a precise, conservative medical evidence evaluator.
22
+
23
+ Your task:
24
+ Determine the relationship between the following MEDICAL TEXT and the SUBCLAIM.
25
+
26
+ Use ONLY these labels (lowercase):
27
+ - supported → the TEXT clearly supports the SUBCLAIM. The information is
28
+ explicitly stated or follows from a very direct and
29
+ unambiguous medical inference (e.g., “fiebre de 39°C”
30
+ supports “tenía fiebre”).
31
+ - refuted → the TEXT clearly contradicts the SUBCLAIM (e.g., the TEXT
32
+ states the opposite, or provides mutually exclusive values:
33
+ different drug, dose, duration, time point, diagnosis, etc.).
34
+ - not_supported → the TEXT is related to the SUBCLAIM but does NOT provide
35
+ enough evidence to mark it as supported or refuted
36
+ (e.g., missing or different dose, duration, timing,
37
+ route, frequency, or diagnosis; or the claim simply
38
+ is not mentioned).
39
+
40
+ Important instructions:
41
+ - Be STRICT and CONSERVATIVE:
42
+ - If exact numerical details (dose, time, duration, frequency, age, etc.)
43
+ in the SUBCLAIM are not explicitly stated or clearly implied in the TEXT,
44
+ choose not_supported.
45
+ - Do NOT assume or infer information beyond what is clearly supported by
46
+ the TEXT, even if it seems medically plausible.
47
+ - Use refuted ONLY when there is a clear contradiction between TEXT and
48
+ SUBCLAIM.
49
+ - Ignore your external medical knowledge; base your decision ONLY on the TEXT.
50
+ - The TEXT and SUBCLAIM may be in Spanish; evaluate them as written.
51
+ - Do NOT add any explanation, justification, or extra text.
52
+
53
+ Medical Text:
54
+ {text}
55
+
56
+ Subclaim:
57
+ {subclaim}
58
+
59
+ Respond with exactly ONE label:
60
+ supported
61
+ refuted
62
+ not_supported
63
+ """
64
+
65
+ # -----------------------------
66
+ # VERIFICATION LOGIC
67
+ # -----------------------------
68
+ def check_support(text: str, subclaim: str, item_id=None, error_log=None) -> str:
69
+ """
70
+ Returns: 'supported', 'refuted', or 'not_supported'
71
+ Tracks errors in error_log if provided.
72
+ """
73
+ if not text or not subclaim:
74
+ return "not_supported"
75
+
76
+ prompt = inference_prompt(text, subclaim)
77
+
78
+ try:
79
+ response = client.chat.completions.create(
80
+ model=MODEL_PATH,
81
+ messages=[{"role": "user", "content": prompt}],
82
+ max_tokens=512,
83
+ temperature=0.1,
84
+ )
85
+ res = response.choices[0].message.content
86
+ if "</think>" in res:
87
+ res = res.split("</think>")[1].strip().lower()
88
+ else:
89
+ res = response.choices[0].message.content.strip().lower()
90
+
91
+ if "not_supported" in res:
92
+ return "not_supported"
93
+ elif "supported" in res:
94
+ return "supported"
95
+ elif "refuted" in res:
96
+ return "refuted"
97
+ else:
98
+ return "not_supported"
99
+
100
+ except Exception as e:
101
+ # --- ERROR TRACKING ---
102
+ if error_log is not None:
103
+ error_details = {
104
+ "id": item_id,
105
+ "subclaim": subclaim,
106
+ "error_msg": str(e),
107
+ "type": "API_ERROR"
108
+ }
109
+ error_log.append(error_details)
110
+ # ----------------------
111
+
112
+ # Optional: Print to console so you see it happening live
113
+ print(f"\n[!] Error on ID {item_id}: {e}")
114
+ return "not_supported"
115
+
116
+ def calculate_metric(subclaims_list: list, reference_text: str, metric_name: str, item_id=None, error_log=None):
117
+ if not subclaims_list:
118
+ return {"score": 0.0, "details": []}
119
+
120
+ results = []
121
+ supported_count = 0
122
+
123
+ for subclaim in subclaims_list:
124
+ # Pass tracking info down to check_support
125
+ label = check_support(reference_text, subclaim, item_id=item_id, error_log=error_log)
126
+
127
+ is_supported = (label == "supported")
128
+
129
+ if is_supported:
130
+ supported_count += 1
131
+
132
+ results.append({
133
+ "subclaim": subclaim,
134
+ "label": label
135
+ })
136
+
137
+ score = supported_count / len(subclaims_list) if len(subclaims_list) > 0 else 0.0
138
+
139
+ return {
140
+ "score": score,
141
+ "details": results
142
+ }
143
+
144
+ # -----------------------------
145
+ # MAIN
146
+ # -----------------------------
147
+ if __name__ == "__main__":
148
+ parser = argparse.ArgumentParser()
149
+ parser.add_argument("--input_file", type=str,
150
+ default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json",
151
+ help="Path to input JSON with subclaims")
152
+
153
+ parser.add_argument("--save_folder", type=str,
154
+ default="/home/mshahidul/readctrl/data/concise_complete_attr_cal_v4",
155
+ help="Folder to save results")
156
+
157
+ # Range arguments
158
+ parser.add_argument("--start_index", type=int, default=0, help="Start index")
159
+ parser.add_argument("--end_index", type=int, default=6, help="End index (exclusive). -1 for all.")
160
+
161
+ args = parser.parse_args()
162
+
163
+ INPUT_FILE = args.input_file
164
+ SAVE_FOLDER = args.save_folder
165
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
166
+
167
+ # -----------------------------
168
+ # Load Data
169
+ # -----------------------------
170
+ print(f"Loading data from {INPUT_FILE}...")
171
+ with open(INPUT_FILE, "r") as f:
172
+ all_data = json.load(f)
173
+
174
+ # -----------------------------
175
+ # Slice Data based on Range
176
+ # -----------------------------
177
+ total_len = len(all_data)
178
+ start = args.start_index
179
+ end = args.end_index if args.end_index != -1 else total_len
180
+
181
+ if end > total_len:
182
+ end = total_len
183
+
184
+ data_slice = all_data[start:end]
185
+
186
+ print(f"Total dataset size: {total_len}")
187
+ print(f"Processing range: {start} to {end}")
188
+ print(f"Items in this batch: {len(data_slice)}")
189
+
190
+ # -----------------------------
191
+ # Output Files
192
+ # -----------------------------
193
+ OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}.json")
194
+ ERROR_LOG_FILE = os.path.join(SAVE_FOLDER, f"error_log_{start}_{end}.json")
195
+
196
+ # -----------------------------
197
+ # Resume Logic
198
+ # -----------------------------
199
+ processed_results = []
200
+ if os.path.exists(OUTPUT_FILE):
201
+ print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
202
+ try:
203
+ with open(OUTPUT_FILE, "r") as f:
204
+ processed_results = json.load(f)
205
+ except:
206
+ processed_results = []
207
+
208
+ processed_ids = {item['id'] for item in processed_results}
209
+ to_process = [item for item in data_slice if item['id'] not in processed_ids]
210
+
211
+ print(f"Already processed in this file: {len(processed_ids)}")
212
+ print(f"Remaining to process: {len(to_process)}")
213
+
214
+ # -----------------------------
215
+ # Initialize Error Tracker
216
+ # -----------------------------
217
+ global_error_log = []
218
+
219
+ # -----------------------------
220
+ # Processing Loop
221
+ # -----------------------------
222
+ # Added tqdm postfix to show error count in real-time
223
+ pbar = tqdm.tqdm(to_process)
224
+
225
+ for item in pbar:
226
+ current_id = item.get('id', 'unknown')
227
+
228
+ # 1. Prepare Texts
229
+ easy_text = item.get("easy_text", "")
230
+ inter_text = item.get("intermediate_text", "")
231
+ hard_text = item.get("hard_text", "")
232
+ fulltext = item.get("fulltext", "")
233
+ summary = item.get("summary", "")
234
+
235
+ # 2. Prepare Subclaim Lists
236
+ def ensure_list(x): return x if isinstance(x, list) else []
237
+
238
+ easy_subs = ensure_list(item.get("easy_subclaims", []))
239
+ inter_subs = ensure_list(item.get("intermediate_subclaims", []))
240
+ hard_subs = ensure_list(item.get("hard_subclaims", []))
241
+ full_subs = ensure_list(item.get("fulltext_subclaims", []))
242
+ summary_subs = ensure_list(item.get("summary_subclaims", []))
243
+
244
+ # ---------------------------------------------------------
245
+ # METRICS CALCULATION (Now passing id and error_log)
246
+ # ---------------------------------------------------------
247
+
248
+ # Attribution: Generated Subclaims -> Full Text
249
+ attr_easy = calculate_metric(easy_subs, fulltext, "attribution", current_id, global_error_log)
250
+ attr_inter = calculate_metric(inter_subs, fulltext, "attribution", current_id, global_error_log)
251
+ attr_hard = calculate_metric(hard_subs, fulltext, "attribution", current_id, global_error_log)
252
+
253
+ # Conciseness: Generated Subclaims -> Summary Text
254
+ conc_easy = calculate_metric(easy_subs, summary, "conciseness", current_id, global_error_log)
255
+ conc_inter = calculate_metric(inter_subs, summary, "conciseness", current_id, global_error_log)
256
+ conc_hard = calculate_metric(hard_subs, summary, "conciseness", current_id, global_error_log)
257
+
258
+ # Completeness: summary Subclaims -> Generated Text
259
+ comp_easy = calculate_metric(summary_subs, easy_text, "completeness", current_id, global_error_log)
260
+ comp_inter = calculate_metric(summary_subs, inter_text, "completeness", current_id, global_error_log)
261
+ comp_hard = calculate_metric(summary_subs, hard_text, "completeness", current_id, global_error_log)
262
+
263
+ # Construct Output
264
+ result_item = item.copy()
265
+ result_item["metrics"] = {
266
+ "easy": {
267
+ "attribution": attr_easy,
268
+ "conciseness": conc_easy,
269
+ "completeness": comp_easy
270
+ },
271
+ "intermediate": {
272
+ "attribution": attr_inter,
273
+ "conciseness": conc_inter,
274
+ "completeness": comp_inter
275
+ },
276
+ "hard": {
277
+ "attribution": attr_hard,
278
+ "conciseness": conc_hard,
279
+ "completeness": comp_hard
280
+ }
281
+ }
282
+
283
+ processed_results.append(result_item)
284
+
285
+ # Update progress bar with error count
286
+ if len(global_error_log) > 0:
287
+ pbar.set_postfix({"Errors": len(global_error_log)})
288
+
289
+ # Save frequently
290
+ if len(processed_results) % 10 == 0:
291
+ with open(OUTPUT_FILE, "w") as f:
292
+ json.dump(processed_results, f, indent=4, ensure_ascii=False)
293
+
294
+ # Final Save
295
+ with open(OUTPUT_FILE, "w") as f:
296
+ json.dump(processed_results, f, indent=4, ensure_ascii=False)
297
+
298
+ print(f"Evaluation for range {start}:{end} complete. Saved to: {OUTPUT_FILE}")
299
+
300
+ # -----------------------------
301
+ # Error Reporting
302
+ # -----------------------------
303
+ if global_error_log:
304
+ print(f"\n⚠️ WARNING: {len(global_error_log)} API errors occurred during processing.")
305
+ with open(ERROR_LOG_FILE, "w") as f:
306
+ json.dump(global_error_log, f, indent=4)
307
+ print(f"Error details saved to: {ERROR_LOG_FILE}")
308
+ else:
309
+ print("\n✅ Success: No API errors detected.")
code/finetune-inference/subclaim_support_extraction/old/subclaim_support_cal_v5.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/Mistral-Small-3.1-24B_subclaims-support-check-8b_ctx_v2-bf16"
11
+ API_URL = "http://172.16.34.29:8004/v1"
12
+ API_KEY = "EMPTY"
13
+
14
+ client = OpenAI(base_url=API_URL, api_key=API_KEY)
15
+
16
+ # -----------------------------
17
+ # VERIFICATION PROMPT
18
+ # -----------------------------
19
+ def inference_prompt(text, subclaim):
20
+ return f"""You are a clinical evidence auditor. Your evaluation must be based STRICTLY and ONLY on the provided medical text.
21
+
22
+ ### MANDATORY GROUNDING RULES:
23
+ 1. NO OUTSIDE KNOWLEDGE: Do not use your internal medical knowledge. Even if a subclaim is "common sense" in medicine, if it is not explicitly in the TEXT, it is 'not_supported'.
24
+ 2. NO LOGICAL LEAPS: Do not bridge gaps in logic. (e.g., If the text mentions "high blood sugar" but not the word "diabetes", you cannot support a claim of "diabetes").
25
+ 3. EXACT NUMERICAL MATCHING: Any doses (e.g., 500mg), frequencies (e.g., twice daily), or durations (e.g., 10 days) mentioned in the subclaim must match the text perfectly. If they are missing or different in the text, label as 'not_supported'.
26
+ 4. DEFAULT TO NOT SUPPORTED: If the text is vague, ambiguous, or only suggests a possibility, you MUST choose 'not_supported'.
27
+ 5. CLOSED-WORLD REALITY: Treat the TEXT as the only information that exists in the world.
28
+
29
+ ### Medical Text:
30
+ {text}
31
+
32
+ ### Subclaim:
33
+ {subclaim}
34
+
35
+ Output exactly one word ('supported' or 'not_supported') based on the strict rules above:"""
36
+
37
+ # -----------------------------
38
+ # VERIFICATION LOGIC
39
+ # -----------------------------
40
+ def check_support(text: str, subclaim: str, item_id=None, error_log=None) -> str:
41
+ """
42
+ Returns: 'supported', 'refuted', or 'not_supported'
43
+ Tracks errors in error_log if provided.
44
+ """
45
+ if not text or not subclaim:
46
+ return "not_supported"
47
+
48
+ prompt = inference_prompt(text, subclaim)
49
+
50
+ try:
51
+ response = client.chat.completions.create(
52
+ model=MODEL_PATH,
53
+ messages=[{"role": "user", "content": prompt}],
54
+ max_tokens=512,
55
+ temperature=0.1,
56
+ )
57
+ res = response.choices[0].message.content
58
+ if "</think>" in res:
59
+ res = res.split("</think>")[1].strip().lower()
60
+ else:
61
+ res = response.choices[0].message.content.strip().lower()
62
+
63
+ if "not_supported" in res:
64
+ return "not_supported"
65
+ elif "supported" in res:
66
+ return "supported"
67
+ elif "refuted" in res:
68
+ return "refuted"
69
+ else:
70
+ return "not_supported"
71
+
72
+ except Exception as e:
73
+ # --- ERROR TRACKING ---
74
+ if error_log is not None:
75
+ error_details = {
76
+ "id": item_id,
77
+ "subclaim": subclaim,
78
+ "error_msg": str(e),
79
+ "type": "API_ERROR"
80
+ }
81
+ error_log.append(error_details)
82
+ # ----------------------
83
+
84
+ # Optional: Print to console so you see it happening live
85
+ print(f"\n[!] Error on ID {item_id}: {e}")
86
+ return "not_supported"
87
+
88
+ def calculate_metric(subclaims_list: list, reference_text: str, metric_name: str, item_id=None, error_log=None):
89
+ if not subclaims_list:
90
+ return {"score": 0.0, "details": []}
91
+
92
+ results = []
93
+ supported_count = 0
94
+
95
+ for subclaim in subclaims_list:
96
+ # Pass tracking info down to check_support
97
+ label = check_support(reference_text, subclaim, item_id=item_id, error_log=error_log)
98
+
99
+ is_supported = (label == "supported")
100
+
101
+ if is_supported:
102
+ supported_count += 1
103
+
104
+ results.append({
105
+ "subclaim": subclaim,
106
+ "label": label
107
+ })
108
+
109
+ score = supported_count / len(subclaims_list) if len(subclaims_list) > 0 else 0.0
110
+
111
+ return {
112
+ "score": score,
113
+ "details": results
114
+ }
115
+
116
+ # -----------------------------
117
+ # MAIN
118
+ # -----------------------------
119
+ if __name__ == "__main__":
120
+ parser = argparse.ArgumentParser()
121
+ parser.add_argument("--input_file", type=str,
122
+ default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json",
123
+ help="Path to input JSON with subclaims")
124
+
125
+ parser.add_argument("--save_folder", type=str,
126
+ default="/home/mshahidul/readctrl/data/concise_complete_attr_testing",
127
+ help="Folder to save results")
128
+
129
+ # Range arguments
130
+ parser.add_argument("--start_index", type=int, default=0, help="Start index")
131
+ parser.add_argument("--end_index", type=int, default=6, help="End index (exclusive). -1 for all.")
132
+
133
+ args = parser.parse_args()
134
+
135
+ INPUT_FILE = args.input_file
136
+ SAVE_FOLDER = args.save_folder
137
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
138
+
139
+ # -----------------------------
140
+ # Load Data
141
+ # -----------------------------
142
+ print(f"Loading data from {INPUT_FILE}...")
143
+ with open(INPUT_FILE, "r") as f:
144
+ all_data = json.load(f)
145
+
146
+ # -----------------------------
147
+ # Slice Data based on Range
148
+ # -----------------------------
149
+ total_len = len(all_data)
150
+ start = args.start_index
151
+ end = args.end_index if args.end_index != -1 else total_len
152
+
153
+ if end > total_len:
154
+ end = total_len
155
+
156
+ data_slice = all_data[start:end]
157
+
158
+ print(f"Total dataset size: {total_len}")
159
+ print(f"Processing range: {start} to {end}")
160
+ print(f"Items in this batch: {len(data_slice)}")
161
+
162
+ # -----------------------------
163
+ # Output Files
164
+ # -----------------------------
165
+ OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}_mistral31_24B_v2.json")
166
+ ERROR_LOG_FILE = os.path.join(SAVE_FOLDER, f"error_log_{start}_{end}_mistral31_24B_v2.json")
167
+
168
+ # -----------------------------
169
+ # Resume Logic
170
+ # -----------------------------
171
+ processed_results = []
172
+ if os.path.exists(OUTPUT_FILE):
173
+ print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
174
+ try:
175
+ with open(OUTPUT_FILE, "r") as f:
176
+ processed_results = json.load(f)
177
+ except:
178
+ processed_results = []
179
+
180
+ processed_ids = {item['id'] for item in processed_results}
181
+ to_process = [item for item in data_slice if item['id'] not in processed_ids]
182
+
183
+ print(f"Already processed in this file: {len(processed_ids)}")
184
+ print(f"Remaining to process: {len(to_process)}")
185
+
186
+ # -----------------------------
187
+ # Initialize Error Tracker
188
+ # -----------------------------
189
+ global_error_log = []
190
+
191
+ # -----------------------------
192
+ # Processing Loop
193
+ # -----------------------------
194
+ # Added tqdm postfix to show error count in real-time
195
+ pbar = tqdm.tqdm(to_process)
196
+
197
+ for item in pbar:
198
+ current_id = item.get('id', 'unknown')
199
+
200
+ # 1. Prepare Texts
201
+ easy_text = item.get("easy_text", "")
202
+ inter_text = item.get("intermediate_text", "")
203
+ hard_text = item.get("hard_text", "")
204
+ fulltext = item.get("fulltext", "")
205
+ summary = item.get("summary", "")
206
+
207
+ # 2. Prepare Subclaim Lists
208
+ def ensure_list(x): return x if isinstance(x, list) else []
209
+
210
+ easy_subs = ensure_list(item.get("easy_subclaims", []))
211
+ inter_subs = ensure_list(item.get("intermediate_subclaims", []))
212
+ hard_subs = ensure_list(item.get("hard_subclaims", []))
213
+ full_subs = ensure_list(item.get("fulltext_subclaims", []))
214
+ summary_subs = ensure_list(item.get("summary_subclaims", []))
215
+
216
+ # ---------------------------------------------------------
217
+ # METRICS CALCULATION (Now passing id and error_log)
218
+ # ---------------------------------------------------------
219
+
220
+ # Attribution: Generated Subclaims -> Full Text
221
+ attr_easy = calculate_metric(easy_subs, fulltext, "attribution", current_id, global_error_log)
222
+ attr_inter = calculate_metric(inter_subs, fulltext, "attribution", current_id, global_error_log)
223
+ attr_hard = calculate_metric(hard_subs, fulltext, "attribution", current_id, global_error_log)
224
+
225
+ # Conciseness: Generated Subclaims -> Summary Text
226
+ conc_easy = calculate_metric(easy_subs, summary, "conciseness", current_id, global_error_log)
227
+ conc_inter = calculate_metric(inter_subs, summary, "conciseness", current_id, global_error_log)
228
+ conc_hard = calculate_metric(hard_subs, summary, "conciseness", current_id, global_error_log)
229
+
230
+ # Completeness: summary Subclaims -> Generated Text
231
+ comp_easy = calculate_metric(summary_subs, easy_text, "completeness", current_id, global_error_log)
232
+ comp_inter = calculate_metric(summary_subs, inter_text, "completeness", current_id, global_error_log)
233
+ comp_hard = calculate_metric(summary_subs, hard_text, "completeness", current_id, global_error_log)
234
+
235
+ # Construct Output
236
+ result_item = item.copy()
237
+ result_item["metrics"] = {
238
+ "easy": {
239
+ "attribution": attr_easy,
240
+ "conciseness": conc_easy,
241
+ "completeness": comp_easy
242
+ },
243
+ "intermediate": {
244
+ "attribution": attr_inter,
245
+ "conciseness": conc_inter,
246
+ "completeness": comp_inter
247
+ },
248
+ "hard": {
249
+ "attribution": attr_hard,
250
+ "conciseness": conc_hard,
251
+ "completeness": comp_hard
252
+ }
253
+ }
254
+
255
+ processed_results.append(result_item)
256
+
257
+ # Update progress bar with error count
258
+ if len(global_error_log) > 0:
259
+ pbar.set_postfix({"Errors": len(global_error_log)})
260
+
261
+ # Save frequently
262
+ if len(processed_results) % 10 == 0:
263
+ with open(OUTPUT_FILE, "w") as f:
264
+ json.dump(processed_results, f, indent=4, ensure_ascii=False)
265
+
266
+ # Final Save
267
+ with open(OUTPUT_FILE, "w") as f:
268
+ json.dump(processed_results, f, indent=4, ensure_ascii=False)
269
+
270
+ print(f"Evaluation for range {start}:{end} complete. Saved to: {OUTPUT_FILE}")
271
+
272
+ # -----------------------------
273
+ # Error Reporting
274
+ # -----------------------------
275
+ if global_error_log:
276
+ print(f"\n⚠️ WARNING: {len(global_error_log)} API errors occurred during processing.")
277
+ with open(ERROR_LOG_FILE, "w") as f:
278
+ json.dump(global_error_log, f, indent=4)
279
+ print(f"Error details saved to: {ERROR_LOG_FILE}")
280
+ else:
281
+ print("\n✅ Success: No API errors detected.")
code/finetune-inference/subclaim_support_extraction/readctrl_model.code-workspace ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": "../../../../readctrl_model"
5
+ },
6
+ {
7
+ "path": "../../.."
8
+ }
9
+ ],
10
+ "settings": {
11
+ "folder-color.pathColors": [
12
+ {
13
+ "folderPath": "/home/mshahidul/readctrl/data/thresold_finding/",
14
+ "badge": "🥶"
15
+ }
16
+ ]
17
+ }
18
+ }
code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
11
+ model_name="qwen3-32B"
12
+ API_URL = "http://172.16.34.29:8004/v1"
13
+ API_KEY = "EMPTY"
14
+ print(f"Using model: {MODEL_PATH}")
15
+ print(f"Model name: {model_name}")
16
+ client = OpenAI(base_url=API_URL, api_key=API_KEY)
17
+
18
+ # -----------------------------
19
+ # VERIFICATION PROMPT
20
+ # -----------------------------
21
def inference_prompt(text, subclaim):
    """Build the strict grounding-verification prompt for one (text, subclaim) pair."""
    prompt_body = f"""You are a clinical evidence auditor. Your evaluation must be based STRICTLY and ONLY on the provided medical text.

### MANDATORY GROUNDING RULES:
1. NO OUTSIDE KNOWLEDGE: Do not use your internal medical knowledge. Even if a subclaim is "common sense" in medicine, if it is not explicitly in the TEXT, it is 'not_supported'.
2. NO LOGICAL LEAPS: Do not bridge gaps in logic. (e.g., If the text mentions "high blood sugar" but not the word "diabetes", you cannot support a claim of "diabetes").
3. EXACT NUMERICAL MATCHING: Any doses (e.g., 500mg), frequencies (e.g., twice daily), or durations (e.g., 10 days) mentioned in the subclaim must match the text perfectly. If they are missing or different in the text, label as 'not_supported'.
4. DEFAULT TO NOT SUPPORTED: If the text is vague, ambiguous, or only suggests a possibility, you MUST choose 'not_supported'.
5. CLOSED-WORLD REALITY: Treat the TEXT as the only information that exists in the world.

### Medical Text:
{text}

### Subclaim:
{subclaim}

Output exactly one word ('supported' or 'not_supported') based on the strict rules above:"""
    return prompt_body
38
+
39
+ # -----------------------------
40
+ # VERIFICATION LOGIC
41
+ # -----------------------------
42
def check_support(text: str, subclaim: str, error_log=None) -> str:
    """
    Ask the verifier model whether `subclaim` is grounded in `text`.

    Returns one of 'supported', 'refuted', or 'not_supported'.
    Any API failure is appended to `error_log` (if provided) and treated
    conservatively as 'not_supported' so the pipeline keeps running.
    """
    # Empty inputs can never be verified; fail closed.
    if not text or not subclaim:
        return "not_supported"

    prompt = inference_prompt(text, subclaim)

    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.1,
        )
        res = response.choices[0].message.content
        # Reasoning models may prepend a <think>...</think> trace; keep only
        # the text after the last closing tag. ([-1] is robust even if the
        # trace itself contains the tag more than once.)
        if "</think>" in res:
            res = res.split("</think>")[-1]
        res = res.strip().lower()

        # 'not_supported' must be tested first: the substring 'supported'
        # also occurs inside 'not_supported', so order matters here.
        if "not_supported" in res:
            return "not_supported"
        elif "supported" in res:
            return "supported"
        elif "refuted" in res:
            return "refuted"
        else:
            # Unparseable model output defaults to the conservative label.
            return "not_supported"

    except Exception as e:
        # --- ERROR TRACKING ---
        if error_log is not None:
            error_log.append({
                "subclaim": subclaim,
                "error_msg": str(e),
                "type": "API_ERROR",
            })
        # Fail closed: an unverifiable claim counts as unsupported.
        return "not_supported"
88
+
89
+
90
+
91
+ # -----------------------------
92
+ # MAIN
93
+ # -----------------------------
94
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/finetuning_data/test_subclaim_support_v2.json",
                        help="Path to input JSON with subclaims")
    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/concise_complete_attr_testing",
                        help="Folder to save results")
    # Range arguments: let several workers split the dataset.
    parser.add_argument("--start_index", type=int, default=0, help="Start index")
    parser.add_argument("--end_index", type=int, default=-1, help="End index (exclusive). -1 for all.")

    args = parser.parse_args()

    INPUT_FILE = args.input_file
    SAVE_FOLDER = args.save_folder
    os.makedirs(SAVE_FOLDER, exist_ok=True)

    # -----------------------------
    # Load Data
    # -----------------------------
    print(f"Loading data from {INPUT_FILE}...")
    with open(INPUT_FILE, "r") as f:
        all_data = json.load(f)

    # -----------------------------
    # Slice Data based on Range
    # -----------------------------
    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len
    if end > total_len:
        end = total_len
    data_slice = all_data[start:end]

    print(f"Total dataset size: {total_len}")
    print(f"Processing range: {start} to {end}")
    print(f"Items in this batch: {len(data_slice)}")

    # -----------------------------
    # Output Files
    # -----------------------------
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"evaluated_metrics_{start}_{end}_{model_name}_v2.json")

    # -----------------------------
    # Resume Logic: reload any earlier partial output and skip finished items.
    # -----------------------------
    processed_results = []
    if os.path.exists(OUTPUT_FILE):
        print(f"Found existing output file: {OUTPUT_FILE}. Resuming...")
        try:
            with open(OUTPUT_FILE, "r") as f:
                processed_results = json.load(f)
        except (OSError, json.JSONDecodeError):
            # Corrupt or unreadable partial file: redo this range from scratch.
            processed_results = []

    # Items are identified by their medical_text (no stable id in this dataset).
    processed_ids = {item['medical_text'] for item in processed_results}
    to_process = [item for item in data_slice if item['medical_text'] not in processed_ids]

    print(f"Already processed in this file: {len(processed_ids)}")
    print(f"Remaining to process: {len(to_process)}")

    # -----------------------------
    # Initialize Error Tracker (filled by check_support on API failures)
    # -----------------------------
    global_error_log = []

    # -----------------------------
    # Processing Loop
    # -----------------------------
    pbar = tqdm.tqdm(to_process)

    for item in pbar:
        text = item.get('medical_text', '')
        subclaim = item.get('subclaim', [])
        label_gt = item.get('label', 'not_supported')

        label_gen = check_support(text, subclaim, error_log=global_error_log)

        # Compare generated vs. ground-truth labels. 'not_supported' is
        # matched first because 'supported' is a substring of it.
        if "not_supported" in label_gen and "not_supported" in label_gt:
            correctness = True
        elif "supported" in label_gen and "supported" in label_gt:
            correctness = True
        else:
            correctness = False
            print(f"Mismatch:\nGT: {label_gt}\nGEN: {label_gen}\nSubclaim: {subclaim}\nText: {text}\n---")

        processed_results.append({
            "medical_text": text,
            "subclaim": subclaim,
            "label_gt": label_gt,
            "label_gen": label_gen,
            "correctness": correctness,
        })

        # Save intermediate results every 10 items so a crash loses little work.
        if len(processed_results) % 10 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=2, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=2, ensure_ascii=False)
code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing_v2.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ # Updated to reflect your specific project paths
11
+ MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
12
+ model_name = "qwen3-32B"
13
+ API_URL = "http://172.16.34.29:8004/v1"
14
+ API_KEY = "EMPTY"
15
+
16
+ client = OpenAI(base_url=API_URL, api_key=API_KEY)
17
+
18
+ # -----------------------------
19
+ # VERIFICATION PROMPT
20
+ # -----------------------------
21
def inference_prompt(text, subclaim):
    """Return the grounding-check prompt for one (medical text, subclaim) pair."""
    audit_prompt = f"""You are a clinical evidence auditor. Your evaluation must be based STRICTLY and ONLY on the provided medical text.

### MANDATORY GROUNDING RULES:
1. NO OUTSIDE KNOWLEDGE: Do not use your internal medical knowledge.
2. NO LOGICAL LEAPS: Do not bridge gaps in logic.
3. EXACT NUMERICAL MATCHING: Any doses, frequencies, or durations must match the text perfectly.
4. DEFAULT TO NOT SUPPORTED: If the text is vague or ambiguous, you MUST choose 'not_supported'.
5. CLOSED-WORLD REALITY: Treat the TEXT as the only information that exists in the world.

### Medical Text:
{text}

### Subclaim:
{subclaim}

Output exactly one word ('supported' or 'not_supported') based on the strict rules above:"""
    return audit_prompt
38
+
39
+ # -----------------------------
40
+ # VERIFICATION LOGIC
41
+ # -----------------------------
42
def check_support(text: str, subclaim: str) -> str:
    """
    Ask the verifier model whether `subclaim` is grounded in `text`.

    Returns 'supported' or 'not_supported'; returns 'error_api' when the
    request itself fails, so callers can tell model verdicts from outages.
    """
    # Empty inputs can never be verified; fail closed.
    if not text or not subclaim:
        return "not_supported"

    prompt = inference_prompt(text, subclaim)

    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10,  # Shortened as we only need one word
            temperature=0.1,
        )
        res = response.choices[0].message.content.strip().lower()

        # Handle reasoning models that might include <think> tags
        if "</think>" in res:
            res = res.split("</think>")[-1].strip()

        # 'not_supported' must be tested first: the substring 'supported'
        # also occurs inside 'not_supported'.
        if "not_supported" in res:
            return "not_supported"
        elif "supported" in res:
            return "supported"
        return "not_supported"

    except Exception:
        # Surface transport/API failures distinctly instead of guessing a label.
        return "error_api"
69
+
70
+ # -----------------------------
71
+ # MAIN
72
+ # -----------------------------
73
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Input: JSON list of documents; each item is read below for 'id',
    # 'fulltext', 'summary', and 'summary_subclaims'.
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_classified_multiclinsum_test_en_en.json")
    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/factual_testing")
    # Optional [start_index, end_index) range so several workers can split the data.
    parser.add_argument("--start_index", type=int, default=0)
    parser.add_argument("--end_index", type=int, default=-1)

    args = parser.parse_args()
    os.makedirs(args.save_folder, exist_ok=True)

    print(f"Loading data from {args.input_file}...")
    with open(args.input_file, "r") as f:
        all_data = json.load(f)

    # Slice Data
    total_len = len(all_data)
    start = args.start_index
    end = args.end_index if args.end_index != -1 else total_len
    data_slice = all_data[start:end]

    OUTPUT_FILE = os.path.join(args.save_folder, f"evaluated_support_{start}_{end}_{model_name}.json")

    processed_results = []
    # Simple resume logic by checking length
    # NOTE(review): resumes by position, not by id — assumes the existing
    # output file came from the same input slice in the same order; verify
    # before reusing a save folder across different inputs.
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            processed_results = json.load(f)
        print(f"Resuming from index {len(processed_results)}")
        data_slice = data_slice[len(processed_results):]

    for item in tqdm.tqdm(data_slice):
        doc_id = item.get('id', 'unknown')
        full_text = item.get('fulltext', '')
        # We usually want to verify if the summary's claims are supported by the full text
        summary_subclaims = item.get('summary_subclaims', [])

        results_for_this_doc = []

        # summary_subclaims is likely a list of strings
        for sc in summary_subclaims:
            label_gen = check_support(full_text, sc)
            results_for_this_doc.append({
                "subclaim": sc,
                "support_label": label_gen
            })

        output_entry = {
            "id": doc_id,
            "fulltext": full_text,
            "summary": item.get('summary', ''),
            "subclaim_evaluations": results_for_this_doc
        }

        processed_results.append(output_entry)

        # Periodic save (every 10 documents) so a crash loses little work.
        if len(processed_results) % 10 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=2, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=2, ensure_ascii=False)
    print(f"Processing complete. Saved to {OUTPUT_FILE}")
code/finetune-inference/subclaim_support_extraction/subclaim_support_cal_tesing_v3.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tqdm
4
+ import argparse
5
+ from openai import OpenAI
6
+
7
+ # -----------------------------
8
+ # CONFIGURATION
9
+ # -----------------------------
10
+ MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
11
+ model_name = "qwen3-32B"
12
+ API_URL = "http://172.16.34.29:8004/v1"
13
+ API_KEY = "EMPTY"
14
+
15
+ client = OpenAI(base_url=API_URL, api_key=API_KEY)
16
+
17
+ # -----------------------------
18
+ # PROMPTS
19
+ # -----------------------------
20
+
21
def get_attribution_prompt(source_text, subclaim):
    """Checks if summary subclaim is grounded in source."""
    attribution_prompt = f"""You are a clinical evidence auditor.
### Medical Text (Source):
{source_text}
### Subclaim (from Summary):
{subclaim}
Output exactly one word ('supported' or 'not_supported') if the Source text contains the info in the Subclaim:"""
    return attribution_prompt
29
+
30
def get_completeness_prompt(summary_text, source_subclaim):
    """Checks if a key source fact is present in the summary."""
    completeness_prompt = f"""You are checking for information loss in a medical summary.
### Summary Text:
{summary_text}
### Key Fact (from Source):
{source_subclaim}
Output exactly one word ('supported' or 'not_supported') if the Summary successfully includes the info from the Key Fact:"""
    return completeness_prompt
38
+
39
+ # -----------------------------
40
+ # LOGIC
41
+ # -----------------------------
42
+
43
def check_support(context: str, subclaim: str, mode="attribution") -> str:
    """Run one grounding check through the verifier model.

    mode='attribution' asks whether `subclaim` is grounded in `context`;
    any other mode treats `context` as the summary and `subclaim` as a
    source fact whose presence is being checked. Returns 'supported',
    'not_supported', or 'error_api' when the request fails.
    """
    # Empty inputs can never be verified; fail closed.
    if not context or not subclaim:
        return "not_supported"

    # Pick the prompt builder matching the requested direction.
    builder = get_attribution_prompt if mode == "attribution" else get_completeness_prompt
    prompt = builder(context, subclaim)

    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10,
            temperature=0.1,
        )
        verdict = response.choices[0].message.content.strip().lower()

        # Reasoning models may emit a <think> trace; keep only what follows it.
        if "</think>" in verdict:
            verdict = verdict.split("</think>")[-1].strip()

        # 'supported' is a substring of 'not_supported', so both checks are needed.
        if "supported" in verdict and "not_supported" not in verdict:
            return "supported"
        return "not_supported"
    except Exception:
        return "error_api"
67
+
68
+ # -----------------------------
69
+ # MAIN
70
+ # -----------------------------
71
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_classified_multiclinsum_test_en_en.json")
    parser.add_argument("--save_folder", type=str,
                        default="/home/mshahidul/readctrl/data/factual_testing")
    # Optional [start_index, end_index) range so several workers can split the data.
    parser.add_argument("--start_index", type=int, default=0)
    parser.add_argument("--end_index", type=int, default=-1)

    args = parser.parse_args()
    os.makedirs(args.save_folder, exist_ok=True)

    with open(args.input_file, "r") as f:
        all_data = json.load(f)

    start, end = args.start_index, (args.end_index if args.end_index != -1 else len(all_data))
    data_slice = all_data[start:end]
    OUTPUT_FILE = os.path.join(args.save_folder, f"full_evaluation_{start}_{end}_{model_name}.json")

    processed_results = []

    for item in tqdm.tqdm(data_slice):
        full_text = item.get('fulltext', '')
        summary = item.get('summary', '')

        # 1. Factual Attribution (Summary -> Source): each summary subclaim
        #    must be grounded in the full source text.
        summary_subclaims = item.get('summary_subclaims', [])
        attribution_results = []
        for sc in summary_subclaims:
            label = check_support(full_text, sc, mode="attribution")
            attribution_results.append({"subclaim": sc, "label": label})

        # 2. Completeness Check (Source -> Summary): each key source fact
        #    should reappear in the summary.
        # Assuming you have already extracted subclaims from the fulltext in your JSON
        source_subclaims = item.get('fulltext_subclaims', [])
        completeness_results = []
        for sc in source_subclaims:
            label = check_support(summary, sc, mode="completeness")
            completeness_results.append({"source_fact": sc, "present_in_summary": label})

        # Fraction of supported subclaims; 0 when there is nothing to check.
        # 'error_api' labels count against the score.
        attr_score = sum(1 for x in attribution_results if x['label'] == 'supported') / len(attribution_results) if attribution_results else 0
        comp_score = sum(1 for x in completeness_results if x['present_in_summary'] == 'supported') / len(completeness_results) if completeness_results else 0

        processed_results.append({
            "id": item.get('id', 'unknown'),
            "scores": {
                "factual_attribution": attr_score,
                "completeness": comp_score
            },
            "attribution_details": attribution_results,
            "completeness_details": completeness_results
        })

        # Periodic save (every 5 documents) so a crash loses little work.
        # ensure_ascii=False keeps clinical text readable in the output,
        # consistent with the sibling evaluation scripts.
        if len(processed_results) % 5 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(processed_results, f, indent=2, ensure_ascii=False)

    # Final save
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_results, f, indent=2, ensure_ascii=False)
    print(f"Done. Saved to {OUTPUT_FILE}")