import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm
|
|
|
|
| |
# All inputs and outputs live under one shared data root.
_DATA_ROOT = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'

# Annotation JSON that is loaded and validated.
data_json_path = f'{_DATA_ROOT}/stage3/filtered_video_image_asr_caption_stage3.json'

# Folders that each item's media filename is resolved against.
audio_asr_folder = _DATA_ROOT
audio_caption_folder = f'{_DATA_ROOT}/audio_caption'
video_folder = f'{_DATA_ROOT}/video'
image_folder = f'{_DATA_ROOT}/video/Video-LLaVA'

# Destination for the items that pass validation.
new_json_path = f'{_DATA_ROOT}/multidataset/video_image_asr_caption_pre_1211.json'
|
|
| |
# Load the annotation items to validate. The encoding is pinned to UTF-8
# (the output file is written as UTF-8 as well) so that non-ASCII text is
# decoded the same way regardless of the machine's locale default.
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
|
|
| |
def file_exists(folder, filename):
    """Return True if ``filename`` names an existing path inside ``folder``.

    Args:
        folder: Directory the filename is resolved against.
        filename: Path relative to ``folder``; may be empty or ``None``.

    Returns:
        True when the joined path exists, False otherwise. An empty or
        ``None`` filename always yields False: joining an empty name onto
        ``folder`` produces the folder itself, which would otherwise be
        reported as an existing file.
    """
    if not filename:
        return False
    return os.path.exists(os.path.join(folder, filename))
|
|
| |
# Running tallies per media type, updated by process_item: "total" counts the
# items that carry the field, "missing" counts those whose file was not found.
# "unknown" covers items that carry none of the recognised media fields.
file_counts = {
    kind: {"total": 0, "missing": 0}
    for kind in ("video", "audio_asr", "audio_caption", "image", "unknown")
}
|
|
| |
# Guards the shared file_counts tallies. process_item is executed by many
# worker threads at once, and "+=" on a dict entry is a read-modify-write
# sequence that is not atomic, so unguarded increments can be lost.
_file_counts_lock = threading.Lock()


def process_item(item):
    """Check that every media file referenced by ``item`` exists on disk.

    Each of the fields ``video``, ``audio_asr``, ``audio_caption`` and
    ``image`` that is present in ``item`` is resolved against its folder.
    The module-level ``file_counts`` tallies are updated as a side effect.

    Args:
        item: Mapping that may carry any of the four media fields.

    Returns:
        A dict with three keys: ``"item"`` (the input, unchanged),
        ``"valid"`` (False if any referenced file is empty or not found, or
        if the item carries none of the four fields) and ``"missing"`` (one
        message per file that was not found).
    """
    # Built per call so the folder globals are read at call time.
    media_sources = (
        ('video', video_folder, 'Video'),
        ('audio_asr', audio_asr_folder, 'Audio ASR'),
        ('audio_caption', audio_caption_folder, 'Audio caption'),
        ('image', image_folder, 'Image'),
    )

    result = {"item": item, "valid": True, "missing": []}
    present = []  # media kinds the item carries
    absent = []   # subset of `present` whose file was not found

    # Filesystem checks run outside the lock so threads do not serialize on I/O.
    for key, folder, label in media_sources:
        if key not in item:
            continue
        present.append(key)
        filename = item[key]
        if not filename or not file_exists(folder, filename):
            result['missing'].append(f"{label} file missing or not found: {filename}")
            absent.append(key)

    if not present:
        # No recognised media field at all: tallied under "unknown".
        present.append('unknown')
        absent.append('unknown')

    if absent:
        result['valid'] = False

    # Apply this item's tallies in one short critical section.
    with _file_counts_lock:
        for key in present:
            file_counts[key]["total"] += 1
        for key in absent:
            file_counts[key]["missing"] += 1

    return result
|
|
| |
new_items = []  # items whose referenced files were all found
texts = []      # rejected items (at least one file missing, or no media field)

# Validate all items on a thread pool, since the work is filesystem lookups.
with ThreadPoolExecutor(max_workers=96) as executor:
    # executor.map yields results in input order, so the saved JSON keeps the
    # source ordering and is reproducible across runs (as_completed would
    # emit items in whatever order the threads happened to finish).
    for result in tqdm(executor.map(process_item, data), total=len(data)):
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])
            for missing in result['missing']:
                print(missing)
|
|
| |
# Make sure the destination directory exists before writing; otherwise a
# missing folder would only surface here, after all validation work is done.
_output_dir = os.path.dirname(new_json_path)
if _output_dir:  # dirname is '' for a bare filename, which makedirs rejects
    os.makedirs(_output_dir, exist_ok=True)

# Save the valid items; ensure_ascii=False keeps non-ASCII text readable.
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)
|
|
| |
# Report how many items were kept, then the per-type tallies.
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print("Total and missing files by type:")  # plain string: nothing to interpolate
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")
|
|
# Group the rejected items by the media fields they carry. An item lands in
# every bucket whose field it contains, whichever of its files was missing.
# All four declared buckets are filled, including the two audio ones.
miss = {'image': [], 'video': [], 'audio_caption': [], 'audio_asr': []}
for text in texts:
    for media_key, bucket in miss.items():
        if media_key in text:
            bucket.append(text)
| |
|
|