| import subprocess |
| import json |
| import os |
| import requests |
|
|
| |
# Hugging Face datasets-server endpoint for paging through dataset rows.
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"  # dataset repo id on the Hub
config = "default"  # dataset configuration name
split = "train"     # dataset split to page through
offset = 0          # starting row index of the first page
length = 100        # rows fetched per request (API page size)
total_data = 1000   # total number of rows to download
iterations = total_data // length  # number of paged requests needed
|
|
# Directory where the downloaded images are stored.
image_dir = "../images_large"
# exist_ok=True replaces the racy os.path.exists()-then-makedirs() pattern:
# creation is atomic and idempotent.
os.makedirs(image_dir, exist_ok=True)

# Maps "<row_idx>_row_image" -> caption text; dumped to JSON at the end.
text_data = {}
|
|
| |
for i in range(iterations):
    # One page of rows from the datasets-server API.
    url = (
        f"{base_url}?dataset={dataset_path}&config={config}"
        f"&split={split}&offset={offset}&length={length}"
    )

    # Use requests directly (already imported and used below for images)
    # instead of shelling out to curl via subprocess: one HTTP stack,
    # real exceptions, and a timeout so a dead server cannot hang forever.
    try:
        api_response = requests.get(url, timeout=30)
        output = api_response.text
    except requests.RequestException as exc:
        # Mirrors the old behaviour where a failed curl call produced
        # unparseable stdout: report and skip this page.
        print(f"无法将输出转换为字典。输出内容: {exc}")
        continue

    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        print(f"无法将输出转换为字典。输出内容: {output}")
        # NOTE: offset is intentionally not advanced here, matching the
        # original control flow (the same page is retried next iteration).
        continue

    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')

            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                # Stream the image in chunks; the context manager releases
                # the connection (the original leaked every response).
                with requests.get(image_url, stream=True, timeout=30) as response:
                    if response.status_code == 200:
                        with open(image_filename, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)

            # Record the caption for this row (presumably intended even when
            # no image URL is present — TODO confirm against downstream use).
            text_data[f"{row_idx}_row_image"] = text

    offset += length
|
|
| |
# Persist the caption index alongside the downloaded images.
json_filename = "../data/row_image_texts_large.json"
# Explicit encoding avoids platform-dependent defaults for the text file.
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(text_data, f, indent=4)

# Fixed: the completion message previously named the wrong output file
# ("row_image_texts.json"); interpolating json_filename keeps it accurate.
print(f"图像下载并保存完成,文本信息已保存到 {json_filename}")
|
|
|
|