123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- import numpy as np
- import torch
- import functools
- import os
- import time
- from collections import defaultdict, deque
- __all__ = [
- "AverageMeter",
- "MeterBuffer",
- "get_total_and_free_memory_in_Mb",
- "occupy_mem",
- "gpu_mem_usage",
- ]
- def get_total_and_free_memory_in_Mb(cuda_device):
- devices_info_str = os.popen(
- "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
- )
- devices_info = devices_info_str.read().strip().split("\n")
- total, used = devices_info[int(cuda_device)].split(",")
- return int(total), int(used)
- def occupy_mem(cuda_device, mem_ratio=0.95):
- """
- pre-allocate gpu memory for training to avoid memory Fragmentation.
- """
- total, used = get_total_and_free_memory_in_Mb(cuda_device)
- max_mem = int(total * mem_ratio)
- block_mem = max_mem - used
- x = torch.cuda.FloatTensor(256, 1024, block_mem)
- del x
- time.sleep(5)
- def gpu_mem_usage():
- """
- Compute the GPU memory usage for the current device (MB).
- """
- mem_usage_bytes = torch.cuda.max_memory_allocated()
- return mem_usage_bytes / (1024 * 1024)
- class AverageMeter:
- """Track a series of values and provide access to smoothed values over a
- window or the global series average.
- """
- def __init__(self, window_size=50):
- self._deque = deque(maxlen=window_size)
- self._total = 0.0
- self._count = 0
- def update(self, value):
- self._deque.append(value)
- self._count += 1
- self._total += value
- @property
- def median(self):
- d = np.array(list(self._deque))
- return np.median(d)
- @property
- def avg(self):
- # if deque is empty, nan will be returned.
- d = np.array(list(self._deque))
- return d.mean()
- @property
- def global_avg(self):
- return self._total / max(self._count, 1e-5)
- @property
- def latest(self):
- return self._deque[-1] if len(self._deque) > 0 else None
- @property
- def total(self):
- return self._total
- def reset(self):
- self._deque.clear()
- self._total = 0.0
- self._count = 0
- def clear(self):
- self._deque.clear()
- class MeterBuffer(defaultdict):
- """Computes and stores the average and current value"""
- def __init__(self, window_size=20):
- factory = functools.partial(AverageMeter, window_size=window_size)
- super().__init__(factory)
- def reset(self):
- for v in self.values():
- v.reset()
- def get_filtered_meter(self, filter_key="time"):
- return {k: v for k, v in self.items() if filter_key in k}
- def update(self, values=None, **kwargs):
- if values is None:
- values = {}
- values.update(kwargs)
- for k, v in values.items():
- if isinstance(v, torch.Tensor):
- v = v.detach()
- self[k].update(v)
- def clear_meters(self):
- for v in self.values():
- v.clear()
|