| from multiprocessing import shared_memory |
| |
| |
| |
| |
| |
| |
|
|
| import random |
| import pickle |
| import time |
| import copy |
| import torch |
| import torch.distributed as dist |
| from lib.cfg_holder import cfg_unique_holder as cfguh |
|
|
def singleton(class_):
    """Class decorator that hands out one shared instance of ``class_``.

    The returned factory constructs ``class_`` with the supplied arguments
    on its first call and stores the object. Every later call returns that
    stored object and ignores whatever arguments it is given.
    """
    created = {}

    def getinstance(*args, **kwargs):
        if class_ in created:
            return created[class_]
        # First request: build the instance and remember it for later calls.
        obj = class_(*args, **kwargs)
        created[class_] = obj
        return obj

    return getinstance
|
|
def is_ddp():
    """Tell whether torch.distributed is both available and initialized."""
    if not dist.is_available():
        # Short-circuit: is_initialized() is not consulted in this case.
        return False
    return dist.is_initialized()
|
|
def get_rank(type='local'):
    """Return the rank of the current process in the requested scope.

    The global rank is ``dist.get_rank()`` when ``is_ddp()`` is true and 0
    otherwise. ``torch.cuda.device_count()`` is used as the local world
    size from which the local and node ranks are derived.

    Args:
        type: One of ``'global'``, ``'local'``, ``'node'`` or ``'all'``.

    Returns:
        ``'global'``: the global rank.
        ``'local'``: global rank modulo the local world size.
        ``'node'``: global rank floor-divided by the local world size.
        ``'all'``: the tuple ``(global, local, node)``.

    Raises:
        AssertionError: If ``type`` is not one of the accepted values.
        ZeroDivisionError: For ``'local'``, ``'node'`` and ``'all'`` when
            ``torch.cuda.device_count()`` is 0.
    """
    # Validate first and raise explicitly: a trailing `assert False` is
    # stripped under `python -O`, which made the function return None for
    # an unknown type instead of failing.
    if type not in ('global', 'local', 'node', 'all'):
        raise AssertionError('Unknown type')

    ddp = is_ddp()
    global_rank = dist.get_rank() if ddp else 0
    local_world_size = torch.cuda.device_count()
    if type == 'global':
        return global_rank
    if type == 'local':
        return global_rank % local_world_size
    if type == 'node':
        return global_rank // local_world_size
    # type == 'all'
    return (
        global_rank,
        global_rank % local_world_size,
        global_rank // local_world_size,
    )
|
|
def get_world_size(type='local'):
    """Return the number of processes (or nodes) in the requested scope.

    The global world size is ``dist.get_world_size()`` when ``is_ddp()`` is
    true and 1 otherwise. The local world size is
    ``torch.cuda.device_count()``.

    Args:
        type: One of ``'global'``, ``'local'``, ``'node'`` or ``'all'``.

    Returns:
        ``'global'``: the global world size.
        ``'local'``: the local world size.
        ``'node'``: global world size floor-divided by local world size.
        ``'all'``: the tuple ``(global, local, node)``.

    Raises:
        AssertionError: If ``type`` is not one of the accepted values.
        ZeroDivisionError: For ``'node'`` and ``'all'`` when
            ``torch.cuda.device_count()`` is 0.
    """
    # Validate first and raise explicitly: a trailing `assert False` is
    # stripped under `python -O`, which made the function return None for
    # an unknown type instead of failing.
    if type not in ('global', 'local', 'node', 'all'):
        raise AssertionError('Unknown type')

    global_world_size = dist.get_world_size() if is_ddp() else 1
    local_world_size = torch.cuda.device_count()
    if type == 'global':
        return global_world_size
    if type == 'local':
        return local_world_size
    if type == 'node':
        return global_world_size // local_world_size
    # type == 'all'
    return (
        global_world_size,
        local_world_size,
        global_world_size // local_world_size,
    )
|
|
class barrier_lock(object):
    """Busy-waiting barrier for ``n`` participants backed by shared memory.

    One flag byte per participant lives in a named ``SharedMemory``
    segment. In :meth:`wait` a participant raises its own flag; participant
    0 spins until every flag is raised and then clears them all, and every
    other participant spins until its own flag has been cleared.

    The instance itself only stores ``n`` and the segment name; each method
    re-attaches to the segment by name.
    """

    def __init__(self, n):
        """Create the shared segment with all ``n`` flags cleared.

        Args:
            n: Number of participants; ``wait`` is called with ids
                ``0 .. n-1``.
        """
        self.n = n
        # Segment name built from the wall clock plus a random offset.
        uid = int(random.random()*10000) + int(time.time())*10000
        self.lock_shmname = 'barrier_lock_{}'.format(uid)
        lock_shm = shared_memory.SharedMemory(
            name=self.lock_shmname, create=True, size=n)
        try:
            lock_shm.buf[:n] = bytes(n)
        finally:
            # Release this handle even if zeroing fails.
            lock_shm.close()

    def destroy(self):
        """Unlink the shared segment; best effort, never raises OSError.

        A segment that is already gone (or cannot be opened/unlinked) is
        silently ignored so that the call is safe to repeat.
        """
        try:
            lock_shm = shared_memory.SharedMemory(
                name=self.lock_shmname)
            lock_shm.close()
            lock_shm.unlink()
        except OSError:
            # Covers FileNotFoundError for a missing segment. Narrower than
            # a bare except so KeyboardInterrupt/SystemExit still propagate.
            return

    def wait(self, k):
        """Block (by spinning) until all ``n`` participants have arrived.

        Args:
            k: Id of the calling participant. Id 0 acts as the coordinator
                that releases everyone else.

        Raises:
            AssertionError: If flag ``k`` is already raised when entering.
        """
        lock_shm = shared_memory.SharedMemory(
            name=self.lock_shmname)
        try:
            assert lock_shm.buf[k] == 0, 'Two waits on the same id is not allowed.'
            lock_shm.buf[k] = 1
            if k == 0:
                # Coordinator: spin until no flag is still 0 ...
                while any(lock_shm.buf[i] == 0 for i in range(self.n)):
                    pass
                # ... then clear every flag, which releases the others.
                for i in range(self.n):
                    lock_shm.buf[i] = 0
            else:
                # Spin until the coordinator has cleared our flag.
                while lock_shm.buf[k] != 0:
                    pass
        finally:
            # Always drop the per-call handle, including on the assertion
            # path, instead of relying on garbage collection.
            lock_shm.close()
|
|
class nodewise_sync_global(object):
    """
    Global part of nodewise_sync, to be created in the master process
    before spawning.

    It records the local world size, creates the barrier_lock shared by the
    local processes, and picks the name of the shared-memory segment used
    for sync ids. The segment itself is not created here.
    """
    def __init__(self):
        self.local_world_size = get_world_size('local')
        self.b_lock = barrier_lock(self.local_world_size)
        # Segment name built from the wall clock plus a random offset.
        uid = int(random.random()*10000) + int(time.time())*10000
        self.id_shmname = 'nodewise_sync_id_shm_{}'.format(uid)

    def destroy(self):
        """Destroy the barrier lock and unlink the sync-id segment.

        Unlinking is best effort: a segment that does not exist (or cannot
        be opened/unlinked) is silently ignored.
        """
        self.b_lock.destroy()
        try:
            shm = shared_memory.SharedMemory(name=self.id_shmname)
            shm.close()
            shm.unlink()
        except OSError:
            # Covers FileNotFoundError for a missing segment. Narrower than
            # a bare except so KeyboardInterrupt/SystemExit still propagate.
            return
|
|
@singleton
class nodewise_sync(object):
    """
    Centralizes node-wise sync activities for one process.
    The backend is multiprocessing shared memory rather than torch.

    As far as this file shows, the intended order is: ``copy_global`` to
    take over the names prepared by a ``nodewise_sync_global``, then
    ``local_init``, after which ``barrier``, ``random_sync_id`` and
    ``broadcast_r0`` can be used. These three are collective: every local
    rank has to call them, otherwise the callers spin in the barrier.
    """
    def __init__(self):
        # Declared here so the 'Not initialized!' checks report their own
        # message (instead of an AttributeError) before local_init() ran.
        self.local_rank = None

    def copy_global(self, reference):
        """Adopt world size, barrier lock and id-segment name of ``reference``.

        Args:
            reference: Object exposing ``local_world_size``, ``b_lock`` and
                ``id_shmname`` (e.g. a ``nodewise_sync_global``).

        Returns:
            ``self``, to allow chaining.
        """
        self.local_world_size = reference.local_world_size
        self.b_lock = reference.b_lock
        self.id_shmname = reference.id_shmname
        return self

    def local_init(self):
        """Record ranks/world sizes; local rank 0 creates the id segment.

        Returns:
            ``self``, to allow chaining.
        """
        self.ddp = is_ddp()
        self.global_rank, self.local_rank, self.node_rank = get_rank('all')
        self.global_world_size, self.local_world_size, self.nodes = get_world_size('all')
        if self.local_rank == 0:
            # Size the segment from a pickled sample id built with the same
            # formula random_sync_id() uses.
            sample = int(random.random()*10000) + int(time.time())*10000
            sample = pickle.dumps(sample)
            shm = shared_memory.SharedMemory(
                name=self.id_shmname, create=True, size=len(sample))
            shm.close()
        return self

    def random_sync_id(self):
        """Return an id generated by local rank 0 and shared with all ranks.

        Local rank 0 writes a pickled id into the id segment and then
        enters the barrier; the other ranks pass the barrier first and then
        read the id from the segment.

        Raises:
            AssertionError: If ``local_init`` has not been called.
        """
        assert self.local_rank is not None, 'Not initialized!'
        if self.local_rank == 0:
            sync_id = int(random.random()*10000) + int(time.time())*10000
            data = pickle.dumps(sync_id)
            shm = shared_memory.SharedMemory(name=self.id_shmname)
            try:
                shm.buf[0:len(data)] = data
                self.barrier()
            finally:
                shm.close()
        else:
            self.barrier()
            shm = shared_memory.SharedMemory(name=self.id_shmname)
            try:
                # The payload was pickled by local rank 0 just above.
                sync_id = pickle.loads(shm.buf)
            finally:
                shm.close()
        return sync_id

    def barrier(self):
        """Wait on the shared barrier lock using this process's local rank."""
        self.b_lock.wait(self.local_rank)

    def broadcast_r0(self, data=None):
        """Send ``data`` from local rank 0 to the other local ranks.

        Local rank 0 pickles ``data`` into a freshly named segment, lets
        the others read it between two barriers, then unlinks the segment.

        Args:
            data: The object to send on local rank 0; must be ``None`` on
                every other rank.

        Returns:
            ``None`` on local rank 0, the received object elsewhere.

        Raises:
            AssertionError: If ``local_init`` has not been called, if rank 0
                passes ``None``, or if another rank passes data.
        """
        assert self.local_rank is not None, 'Not initialized!'
        sync_id = self.random_sync_id()
        shmname = 'broadcast_r0_{}'.format(sync_id)
        if self.local_rank == 0:
            # Identity check: `data != None` is ambiguous for array-like
            # payloads that overload comparison operators.
            assert data is not None, 'Rank 0 needs to input data!'
            data = pickle.dumps(data)
            datan = len(data)
            load_info_shm = shared_memory.SharedMemory(
                name=shmname, create=True, size=datan)
            try:
                load_info_shm.buf[0:datan] = data
                self.barrier()  # payload is ready for the readers
                self.barrier()  # readers have finished loading
            finally:
                # Do not leave the segment behind if anything above fails.
                load_info_shm.close()
                load_info_shm.unlink()
            return None
        else:
            assert data is None, 'Rank other than 0 should input None as data!'
            self.barrier()
            shm = shared_memory.SharedMemory(name=shmname)
            try:
                # The payload was pickled by local rank 0 of this job.
                data = pickle.loads(shm.buf)
            finally:
                shm.close()
            self.barrier()
            return data

    def destroy(self):
        """Destroy the barrier lock and unlink the sync-id segment.

        Unlinking is best effort: a segment that does not exist (or cannot
        be opened/unlinked) is silently ignored.
        """
        # `self.barrier` is a bound method and has no destroy(); the lock
        # object itself lives in `self.b_lock`.
        self.b_lock.destroy()
        try:
            shm = shared_memory.SharedMemory(name=self.id_shmname)
            shm.close()
            shm.unlink()
        except OSError:
            # Covers FileNotFoundError for a missing segment. Narrower than
            # a bare except so KeyboardInterrupt/SystemExit still propagate.
            return
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |