| import subprocess |
| import json |
| import os |
| import requests |
|
|
| |
# Hugging Face datasets-server endpoint for paging through dataset rows.
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"  # dataset repo id on the Hub
config = "default"  # dataset configuration name
split = "train"     # dataset split to page through
offset = 0          # starting row index of the first page
length = 100        # rows fetched per request (API page size)
total_data = 1000   # total number of rows to download
iterations = total_data // length  # number of paged requests needed
|
|
# Directory where the downloaded images are stored.
image_dir = "../images_large"
# exist_ok=True replaces the racy os.path.exists()-then-makedirs() pattern:
# creation is atomic and idempotent.
os.makedirs(image_dir, exist_ok=True)

# Maps "<row_idx>_row_image" -> caption text; dumped to JSON at the end.
text_data = {}
|
|
| |
for i in range(iterations):
    # One page of rows from the datasets-server API.
    url = (
        f"{base_url}?dataset={dataset_path}&config={config}"
        f"&split={split}&offset={offset}&length={length}"
    )

    # Use requests directly (already imported and used below for images)
    # instead of shelling out to curl via subprocess: one HTTP stack,
    # real exceptions, and a timeout so a dead server cannot hang forever.
    try:
        api_response = requests.get(url, timeout=30)
        output = api_response.text
    except requests.RequestException as exc:
        # Mirrors the old behaviour where a failed curl call produced
        # unparseable stdout: report and skip this page.
        print(f"无法将输出转换为字典。输出内容: {exc}")
        continue

    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        print(f"无法将输出转换为字典。输出内容: {output}")
        # NOTE: offset is intentionally not advanced here, matching the
        # original control flow (the same page is retried next iteration).
        continue

    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')

            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                # Stream the image in chunks; the context manager releases
                # the connection (the original leaked every response).
                with requests.get(image_url, stream=True, timeout=30) as response:
                    if response.status_code == 200:
                        with open(image_filename, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)

            # Record the caption for this row (presumably intended even when
            # no image URL is present — TODO confirm against downstream use).
            text_data[f"{row_idx}_row_image"] = text

    offset += length
|
|
| |
# Persist the caption index alongside the downloaded images.
json_filename = "../data/row_image_texts_large.json"
# Explicit encoding avoids platform-dependent defaults for the text file.
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(text_data, f, indent=4)

# Fixed: the completion message previously named the wrong output file
# ("row_image_texts.json"); interpolating json_filename keeps it accurate.
print(f"图像下载并保存完成,文本信息已保存到 {json_filename}")
|
|
|
|