| import os |
| from csv import reader |
| import wget |
| from multiprocessing import Pool, Value |
|
|
# Cross-process counters ('i' = C signed int), shared with the Pool workers.
# Increments are serialized via .get_lock() inside image_dl.
dl_count = Value('i', 0)  # images successfully downloaded (or already on disk)
wl_count = Value('i', 0)  # images that failed to download
# Root directory for the downloaded dataset; one subdirectory per URL path segment.
data_source = "/SBU/dataset"
|
|
| |
|
|
def image_dl(info):
    """Download one image URL into data_source/<sub_dir>/<file_name>.

    Runs inside a multiprocessing.Pool worker. The URL's last two path
    segments pick the target subdirectory and file name.

    Args:
        info: full image URL, e.g. "http://host/abc/0001.jpg".

    Returns:
        1 if the image is available locally (cached or freshly downloaded),
        0 if the download failed.
    """
    global dl_count, wl_count
    # Total URL count in the SBU dataset, used only for the progress line.
    row_count = 1000000
    sub_dir = info.split('/')[-2]
    file_name = info.split('/')[-1]
    image_dir = os.path.join(data_source, sub_dir)
    image_path = os.path.join(image_dir, file_name)

    # Already fetched by an earlier run: count it as done and skip the download.
    if os.path.exists(image_path):
        with dl_count.get_lock():
            dl_count.value += 1
        return 1

    # makedirs with exist_ok avoids the FileExistsError race that plain
    # os.mkdir had when several workers create the same sub-dir at once.
    os.makedirs(image_dir, exist_ok=True)

    try:
        wget.download(info, out=image_path)
        with dl_count.get_lock():
            dl_count.value += 1
        ok = 1
    except (IOError, ValueError) as err:
        # IOError/OSError covers HTTP and network failures; ValueError covers
        # malformed URLs. The failure used to be built into msg1 but never
        # printed — report it so bad URLs are visible.
        print("image {} not found ({})".format(info, err))
        with wl_count.get_lock():
            wl_count.value += 1
        ok = 0

    # Progress line every ~1000 downloads. Reading .value without the lock is
    # a benign race: the numbers are informational only.
    if dl_count.value % 1000 == 0:
        print("\n")
        msg2 = "correct:{} wrong: {}, {}/{} finished".format(
            dl_count.value, wl_count.value,
            dl_count.value + wl_count.value, row_count)
        print(msg2)
    return ok
| |
| |
|
|
|
|
| |
if __name__ == "__main__":
    # The __main__ guard is required for multiprocessing: on spawn-based
    # platforms each worker re-imports this module, and without the guard
    # every worker would try to build its own pool and re-run the download.
    with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh:
        urls = [line.rstrip() for line in fh]
    print("{} imgs to be downloaded".format(len(urls)))

    num_processes = 16
    # Context manager tears the pool down when map returns; the original
    # never called close()/join() on it.
    with Pool(num_processes) as pool:
        pool.map(image_dl, urls)