omnis2 / handle_stage3.py

Wangpeng An

Upload folder using huggingface_hub

28b0783 verified over 1 year ago

4.47 kB

	import json
	import os
	import threading
	from concurrent.futures import ThreadPoolExecutor, as_completed

	from tqdm import tqdm


	# Paths -- every location used by this script lives under one shared data root.
	_data_root = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'

	# Input annotation list that gets filtered.
	data_json_path = f'{_data_root}/stage3/filtered_video_image_asr_caption_stage3.json'
	# Folders the per-item file names are joined onto when checking for existence.
	# ASR audio names are resolved against the data root itself.
	audio_asr_folder = _data_root
	audio_caption_folder = f'{_data_root}/audio_caption'
	video_folder = f'{_data_root}/video'
	image_folder = f'{video_folder}/Video-LLaVA'
	# Destination for the filtered annotation list.
	new_json_path = f'{_data_root}/multidataset/video_image_asr_caption_pre_1211.json'

	# Load JSON data.
	# The encoding is pinned to UTF-8 instead of the locale default: this script
	# writes its output as UTF-8 with non-ASCII characters kept verbatim, so the
	# input must be decoded the same way regardless of the machine's locale.
	# `data` is iterated item by item further down; each item is tested with
	# `'video' in item` etc., so it is presumably a list of dicts -- confirm
	# against the producer of this file.
	with open(data_json_path, 'r', encoding='utf-8') as f:
	    data = json.load(f)

	# Report whether a given file name can be found under a given folder
	def file_exists(folder, filename):
	    """Return True when *filename* joined onto *folder* is an existing path.

	    The two parts are combined with ``os.path.join`` and the result is
	    handed to ``os.path.exists``, so directories count as existing too.
	    """
	    candidate = os.path.join(folder, filename)
	    return os.path.exists(candidate)

	# Running tallies per media type: how many items referenced that type
	# ("total") and how many of those references could not be resolved
	# ("missing"). "unknown" collects items that carry none of the media keys.
	# Each type gets its own inner dict so the counters stay independent.
	file_counts = {
	    kind: {"total": 0, "missing": 0}
	    for kind in ("video", "audio_asr", "audio_caption", "image", "unknown")
	}

	# Serialises updates to the shared `file_counts` tallies. process_item is
	# submitted to a thread pool, and `counts[...] += 1` is a read-modify-write
	# sequence that can lose increments when two threads interleave.
	_file_counts_lock = threading.Lock()


	# Helper function to process each item in the dataset
	def process_item(item):
	    """Check that every media file referenced by *item* exists on disk.

	    For each of the keys ``video``, ``audio_asr``, ``audio_caption`` and
	    ``image`` that is present in *item*, the value is treated as a file
	    name relative to the matching module-level folder and looked up with
	    ``file_exists``. An empty/falsy value counts as missing.

	    Side effect: updates the module-level ``file_counts`` tallies (under a
	    lock, so it is safe to call from several threads at once).

	    Returns a dict with:
	      * ``"item"``    -- the item that was passed in, unchanged;
	      * ``"valid"``   -- False if any referenced file is missing, or if the
	                         item has none of the four media keys at all;
	      * ``"missing"`` -- one human-readable message per missing file (empty
	                         when the item simply has no media keys).
	    """
	    # Built per call so the folder globals are only looked up at call time.
	    checks = (
	        ("video", video_folder, "Video"),
	        ("audio_asr", audio_asr_folder, "Audio ASR"),
	        ("audio_caption", audio_caption_folder, "Audio caption"),
	        ("image", image_folder, "Image"),
	    )

	    result = {"item": item, "valid": True, "missing": []}
	    present = []  # media kinds this item declares
	    absent = []   # subset of `present` whose file could not be found

	    # Filesystem lookups are done outside the lock so threads can overlap them.
	    for kind, folder, label in checks:
	        if kind not in item:
	            continue
	        present.append(kind)
	        filename = item[kind]
	        if not filename or not file_exists(folder, filename):
	            result['missing'].append(f"{label} file missing or not found: {filename}")
	            absent.append(kind)

	    if absent or not present:
	        result['valid'] = False

	    # Apply all counter changes for this item in one short critical section.
	    with _file_counts_lock:
	        for kind in present:
	            file_counts[kind]["total"] += 1
	        for kind in absent:
	            file_counts[kind]["missing"] += 1
	        if not present:
	            # Count as unknown if no valid key is found
	            file_counts["unknown"]["total"] += 1
	            file_counts["unknown"]["missing"] += 1

	    return result

	# Items whose referenced files all exist, kept in the same order as `data`.
	new_items = []
	# Items rejected by process_item (the name is kept because later code
	# iterates over `texts`).
	texts = []

	# Use ThreadPoolExecutor for multithreaded processing: each check is a
	# filesystem lookup, so worker threads can overlap the waits.
	with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
	    # executor.map yields results in input order, so the saved JSON keeps
	    # the dataset's original ordering and is identical from run to run
	    # (collecting in completion order made the output order depend on
	    # thread scheduling). An exception raised inside process_item is
	    # re-raised here when its result is reached.
	    for result in tqdm(executor.map(process_item, data), total=len(data)):
	        if result['valid']:
	            new_items.append(result['item'])
	        else:
	            texts.append(result['item'])  # Collect invalid items if needed
	            for missing in result['missing']:
	                print(missing)

	# Save new_items to a JSON file.
	# Create the destination directory first so that a missing folder does not
	# throw away the whole validation pass with a FileNotFoundError at the end.
	output_dir = os.path.dirname(new_json_path)
	if output_dir:  # a bare file name has no directory part to create
	    os.makedirs(output_dir, exist_ok=True)
	with open(new_json_path, 'w', encoding='utf-8') as f:
	    # ensure_ascii=False keeps non-ASCII text readable in the output file.
	    json.dump(new_items, f, ensure_ascii=False, indent=4)

	# Print the summary of missing and total files by type
	print(f"Saved {len(new_items)} valid items to {new_json_path}")
	print("Total and missing files by type:")
	for file_type, counts in file_counts.items():
	    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")

	# Group the rejected items by the media key they carry. An item that has
	# both an image and a video key lands in both buckets.
	# NOTE(review): only the 'image' and 'video' buckets are filled in the code
	# visible here; 'audio_caption' and 'audio_asr' stay empty -- confirm
	# whether that is intended.
	miss = {kind: [] for kind in ('image', 'video', 'audio_caption', 'audio_asr')}
	for text in texts:
	    for kind in ('image', 'video'):
	        if kind in text:
	            miss[kind].append(text)