metric.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  4. import numpy as np
  5. import torch
  6. import functools
  7. import os
  8. import time
  9. from collections import defaultdict, deque
  10. __all__ = [
  11. "AverageMeter",
  12. "MeterBuffer",
  13. "get_total_and_free_memory_in_Mb",
  14. "occupy_mem",
  15. "gpu_mem_usage",
  16. ]
  17. def get_total_and_free_memory_in_Mb(cuda_device):
  18. devices_info_str = os.popen(
  19. "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
  20. )
  21. devices_info = devices_info_str.read().strip().split("\n")
  22. total, used = devices_info[int(cuda_device)].split(",")
  23. return int(total), int(used)
  24. def occupy_mem(cuda_device, mem_ratio=0.95):
  25. """
  26. pre-allocate gpu memory for training to avoid memory Fragmentation.
  27. """
  28. total, used = get_total_and_free_memory_in_Mb(cuda_device)
  29. max_mem = int(total * mem_ratio)
  30. block_mem = max_mem - used
  31. x = torch.cuda.FloatTensor(256, 1024, block_mem)
  32. del x
  33. time.sleep(5)
  34. def gpu_mem_usage():
  35. """
  36. Compute the GPU memory usage for the current device (MB).
  37. """
  38. mem_usage_bytes = torch.cuda.max_memory_allocated()
  39. return mem_usage_bytes / (1024 * 1024)
  40. class AverageMeter:
  41. """Track a series of values and provide access to smoothed values over a
  42. window or the global series average.
  43. """
  44. def __init__(self, window_size=50):
  45. self._deque = deque(maxlen=window_size)
  46. self._total = 0.0
  47. self._count = 0
  48. def update(self, value):
  49. self._deque.append(value)
  50. self._count += 1
  51. self._total += value
  52. @property
  53. def median(self):
  54. d = np.array(list(self._deque))
  55. return np.median(d)
  56. @property
  57. def avg(self):
  58. # if deque is empty, nan will be returned.
  59. d = np.array(list(self._deque))
  60. return d.mean()
  61. @property
  62. def global_avg(self):
  63. return self._total / max(self._count, 1e-5)
  64. @property
  65. def latest(self):
  66. return self._deque[-1] if len(self._deque) > 0 else None
  67. @property
  68. def total(self):
  69. return self._total
  70. def reset(self):
  71. self._deque.clear()
  72. self._total = 0.0
  73. self._count = 0
  74. def clear(self):
  75. self._deque.clear()
  76. class MeterBuffer(defaultdict):
  77. """Computes and stores the average and current value"""
  78. def __init__(self, window_size=20):
  79. factory = functools.partial(AverageMeter, window_size=window_size)
  80. super().__init__(factory)
  81. def reset(self):
  82. for v in self.values():
  83. v.reset()
  84. def get_filtered_meter(self, filter_key="time"):
  85. return {k: v for k, v in self.items() if filter_key in k}
  86. def update(self, values=None, **kwargs):
  87. if values is None:
  88. values = {}
  89. values.update(kwargs)
  90. for k, v in values.items():
  91. if isinstance(v, torch.Tensor):
  92. v = v.detach()
  93. self[k].update(v)
  94. def clear_meters(self):
  95. for v in self.values():
  96. v.clear()