nn (神经网络)

神经网络类¤

BatchNorm ¤

BatchNorm(
    sz: int,
    eps=1e-05,
    affine=True,
    track_running_stats=True,
    momentum=0.1,
)

对2D或3D输入应用批量归一化。

描述：https://paperswithcode.com/method/batch-normalization
论文: https://arxiv.org/abs/1502.03167v3

参见：Tensor.batchnorm

norm = nn.BatchNorm(3)
t = Tensor.rand(2, 3, 4, 4)
print(t.mean().item(), t.std().item())

0.4998628497123718 0.2808046340942383

t = norm(t)
print(t.mean().item(), t.std().item())

0.49986034631729126 0.28080320358276367

Source code in tinygrad/nn/__init__.py

def __init__(self, sz:int, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1):
  """
  Set up a batch normalization layer over `sz` features.

  `weight` (ones) and `bias` (zeros) exist only when `affine` is true; the running mean (zeros) and
  running variance (ones) buffers are created only when `track_running_stats` is true.
  """
  self.eps = eps
  self.track_running_stats = track_running_stats
  self.momentum = momentum

  # scale and shift parameters, left as None when affine=False
  self.weight: Tensor|None = Tensor.ones(sz) if affine else None
  self.bias: Tensor|None = Tensor.zeros(sz) if affine else None

  # the batch counter uses the 'long' dtype when the device supports it, 'int' otherwise
  counter_dtype = 'long' if is_dtype_supported(dtypes.long) else 'int'
  self.num_batches_tracked = Tensor.zeros(1, dtype=counter_dtype, requires_grad=False)
  if track_running_stats:
    self.running_mean = Tensor.zeros(sz, requires_grad=False)
    self.running_var = Tensor.ones(sz, requires_grad=False)

Conv1d ¤

Conv1d(
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    stride=1,
    padding: int | str = 0,
    dilation=1,
    groups=1,
    bias=True,
) -> Conv2d

对由多个输入平面组成的输入信号应用一维卷积。

参见：https://pytorch.org/docs/stable/generated/torch.nn.Conv1d

conv = nn.Conv1d(1, 1, 3)
t = Tensor.rand(1, 1, 4)
print(t.numpy())

[[[0.8795 0.4541 0.0097 0.2058]]]

t = conv(t)
print(t.numpy())

[[[-0.4703 -0.4046]]]

Source code in tinygrad/nn/__init__.py

def Conv1d(in_channels:int, out_channels:int, kernel_size:int, stride=1, padding:int|str=0, dilation=1, groups=1, bias=True) -> Conv2d:
  """
  Applies a 1D convolution over an input signal composed of several input planes.

  The layer is a `Conv2d` constructed with the kernel size wrapped in a one-element tuple.

  See: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d

  ```python exec="true" source="above" session="tensor" result="python"
  conv = nn.Conv1d(1, 1, 3)
  t = Tensor.rand(1, 1, 4)
  print(t.numpy())
  ```
  ```python exec="true" source="above" session="tensor" result="python"
  t = conv(t)
  print(t.numpy())
  ```
  """
  return Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(kernel_size,), stride=stride, padding=padding,
                dilation=dilation, groups=groups, bias=bias)

Conv2d ¤

Conv2d(
    in_channels: int,
    out_channels: int,
    kernel_size: int | tuple[int, ...],
    stride=1,
    padding: int | tuple[int, ...] | str = 0,
    dilation=1,
    groups=1,
    bias=True,
)

对由多个输入平面组成的输入信号应用二维卷积。

参见：https://pytorch.org/docs/stable/generated/torch.nn.Conv2d

conv = nn.Conv2d(1, 1, 3)
t = Tensor.rand(1, 1, 4, 4)
print(t.numpy())

[[[[0.5956 0.5147 0.3872 0.0464]
   [0.2841 0.1229 0.127  0.7098]
   [0.6705 0.9473 0.0979 0.2868]
   [0.0245 0.9172 0.4672 0.1833]]]]

t = conv(t)
print(t.numpy())

[[[[-0.3339 -0.2574]
   [-0.6481 -0.3689]]]]

Source code in tinygrad/nn/__init__.py

def __init__(self, in_channels:int, out_channels:int, kernel_size:int|tuple[int, ...], stride=1, padding:int|tuple[int, ...]|str=0,
             dilation=1, groups=1, bias=True):
  """
  Set up a convolution layer and draw its initial weights.

  `padding` may be an int, a tuple, or the string 'same' (case-insensitive). 'same' is only accepted with
  `stride == 1` and is expanded into explicit (before, after) padding amounts.

  Raises:
    ValueError: if `padding` is a string other than 'same', or 'same' is combined with a stride other than 1.
  """
  self.kernel_size = make_tuple(kernel_size, 2)
  if isinstance(padding, str):
    if padding.lower() != 'same': raise ValueError(f"Invalid padding string {padding!r}, only 'same' is supported")
    if stride != 1: raise ValueError("padding='same' is not supported for strided convolutions")
    # each dimension needs d*(k-1) padded elements in total; any odd remainder goes on the trailing side
    same_pad = []
    for d, k in zip(make_tuple(dilation, len(self.kernel_size)), self.kernel_size[::-1]):
      total = d*(k-1)
      same_pad.append((total//2, total - total//2))
    padding = tuple(flatten(same_pad))
  self.stride = stride
  self.dilation = dilation
  self.groups = groups
  self.padding = padding
  # weight and bias share the same uniform range, derived from in_channels and the kernel volume
  bound = 1 / math.sqrt(in_channels * prod(self.kernel_size))
  self.weight = Tensor.uniform(out_channels, in_channels//groups, *self.kernel_size, low=-bound, high=bound)
  self.bias: Tensor|None = Tensor.uniform(out_channels, low=-bound, high=bound) if bias else None

ConvTranspose1d ¤

ConvTranspose1d(
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    stride=1,
    padding=0,
    output_padding=0,
    dilation=1,
    groups=1,
    bias=True,
) -> ConvTranspose2d

对由多个输入平面组成的输入信号应用一维转置卷积算子。

参见：https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d

conv = nn.ConvTranspose1d(1, 1, 3)
t = Tensor.rand(1, 1, 4)
print(t.numpy())

[[[0.9025 0.3978 0.601  0.7668]]]

t = conv(t)
print(t.numpy())

[[[ 0.8251  0.248   0.1295  0.3344 -0.1166  0.1377]]]

Source code in tinygrad/nn/__init__.py

def ConvTranspose1d(in_channels:int, out_channels:int, kernel_size:int, stride=1, padding=0, output_padding=0, dilation=1,
                      groups=1, bias=True) -> ConvTranspose2d:
  """
  Applies a 1D transposed convolution operator over an input signal composed of several input planes.

  The layer is a `ConvTranspose2d` constructed with the kernel size wrapped in a one-element tuple.

  See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d

  ```python exec="true" source="above" session="tensor" result="python"
  conv = nn.ConvTranspose1d(1, 1, 3)
  t = Tensor.rand(1, 1, 4)
  print(t.numpy())
  ```
  ```python exec="true" source="above" session="tensor" result="python"
  t = conv(t)
  print(t.numpy())
  ```
  """
  return ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(kernel_size,), stride=stride,
                         padding=padding, output_padding=output_padding, dilation=dilation, groups=groups, bias=bias)

ConvTranspose2d ¤

ConvTranspose2d(
    in_channels: int,
    out_channels: int,
    kernel_size: int | tuple[int, ...],
    stride=1,
    padding=0,
    output_padding=0,
    dilation=1,
    groups=1,
    bias=True,
)

基类: Conv2d

在输入图像上应用二维转置卷积算子。

参见：https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d

conv = nn.ConvTranspose2d(1, 1, 3)
t = Tensor.rand(1, 1, 4, 4)
print(t.numpy())

[[[[0.6829 0.2292 0.1909 0.888 ]
   [0.6999 0.8085 0.1654 0.9843]
   [0.1998 0.2293 0.091  0.4039]
   [0.8206 0.4123 0.0447 0.3923]]]]

t = conv(t)
print(t.numpy())

[[[[-0.2541 -0.3563 -0.0659 -0.2146 -0.3248  0.0389]
   [-0.1354 -0.4135 -0.1794  0.0927 -0.4686  0.0287]
   [-0.0449 -0.0472 -0.2827  0.0577 -0.2218 -0.2035]
   [-0.1392 -0.1152  0.0947  0.0361 -0.1357 -0.1851]
   [-0.0776 -0.2038 -0.2736 -0.1436 -0.2096 -0.2815]
   [-0.1477 -0.0126 -0.1864 -0.2079 -0.1586 -0.2629]]]]

Source code in tinygrad/nn/__init__.py

def __init__(self, in_channels:int, out_channels:int, kernel_size:int|tuple[int, ...], stride=1, padding=0, output_padding=0,
              dilation=1, groups=1, bias=True):
  """
  Set up a transposed convolution layer on top of the parent initializer.

  The parent call sets up the shared attributes; the weight is then replaced with one shaped
  (in_channels, out_channels//groups, *kernel_size), and `output_padding` is stored.
  """
  super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
  # redraw the weight with the transposed channel layout, using the same uniform range formula
  bound = 1 / math.sqrt(in_channels * prod(self.kernel_size))
  weight_shape = (in_channels, out_channels//groups, *self.kernel_size)
  self.weight = Tensor.uniform(*weight_shape, low=-bound, high=bound)
  self.output_padding = output_padding

Linear ¤

Linear(in_features: int, out_features: int, bias=True)

对输入数据应用线性变换。

参见：https://pytorch.org/docs/stable/generated/torch.nn.Linear

lin = nn.Linear(3, 4)
t = Tensor.rand(2, 3)
print(t.numpy())

[[0.8863 0.7761 0.1925]
 [0.6193 0.3755 0.0885]]

t = lin(t)
print(t.numpy())

[[ 0.4911 -0.4845 -0.3609 -0.1215]
 [ 0.4408 -0.0994 -0.442  -0.3155]]

Source code in tinygrad/nn/__init__.py

def __init__(self, in_features:int, out_features:int, bias=True):
  """
  Set up a linear layer with an (out_features, in_features) weight and an optional bias.

  Both tensors are drawn uniformly from [-1/sqrt(in_features), 1/sqrt(in_features)].
  """
  limit = 1 / math.sqrt(in_features)
  self.weight = Tensor.uniform(out_features, in_features, low=-limit, high=limit)
  if bias:
    self.bias = Tensor.uniform(out_features, low=-limit, high=limit)
  else:
    self.bias = None

GroupNorm ¤

GroupNorm(
    num_groups: int,
    num_channels: int,
    eps=1e-05,
    affine=True,
)

在小批量输入上应用组归一化。

描述：https://paperswithcode.com/method/group-normalization
论文: https://arxiv.org/abs/1803.08494v3

norm = nn.GroupNorm(2, 12)
t = Tensor.rand(2, 12, 4, 4) * 2 + 1
print(t.mean().item(), t.std().item())

2.013486385345459 0.5788871645927429

t = norm(t)
print(t.mean().item(), t.std().item())

-3.140890783015493e-07 1.0012893676757812

Source code in tinygrad/nn/__init__.py

def __init__(self, num_groups:int, num_channels:int, eps=1e-5, affine=True):
  """
  Set up a group normalization layer.

  `weight` (ones) and `bias` (zeros), each of length `num_channels`, exist only when `affine` is true.
  """
  self.num_groups = num_groups
  self.num_channels = num_channels
  self.eps = eps
  self.weight: Tensor|None = Tensor.ones(num_channels) if affine else None
  self.bias: Tensor|None = Tensor.zeros(num_channels) if affine else None

InstanceNorm ¤

InstanceNorm(num_features: int, eps=1e-05, affine=True)

对输入的小批量数据应用实例归一化。

描述：https://paperswithcode.com/method/instance-normalization
论文: https://arxiv.org/abs/1607.08022v3

norm = nn.InstanceNorm(3)
t = Tensor.rand(2, 3, 4, 4) * 2 + 1
print(t.mean().item(), t.std().item())

1.931043028831482 0.5658571124076843

t = norm(t)
print(t.mean().item(), t.std().item())

4.189885416394645e-08 1.0052311420440674

Source code in tinygrad/nn/__init__.py

def __init__(self, num_features:int, eps=1e-5, affine=True):
  """
  Set up an instance normalization layer.

  `weight` (ones) and `bias` (zeros), each of length `num_features`, exist only when `affine` is true.
  """
  self.num_features = num_features
  self.eps = eps
  self.weight: Tensor|None = Tensor.ones(num_features) if affine else None
  self.bias: Tensor|None = Tensor.zeros(num_features) if affine else None

LayerNorm ¤

LayerNorm(
    normalized_shape: int | tuple[int, ...],
    eps=1e-05,
    elementwise_affine=True,
)

对输入的小批量应用层归一化。

描述：https://paperswithcode.com/method/layer-normalization
论文: https://arxiv.org/abs/1607.06450v1

norm = nn.LayerNorm(3)
t = Tensor.rand(2, 5, 3) * 2 + 1
print(t.mean().item(), t.std().item())

2.180391788482666 0.5629109144210815

t = norm(t)
print(t.mean().item(), t.std().item())

-3.212850572253956e-07 1.0170164108276367

Source code in tinygrad/nn/__init__.py

def __init__(self, normalized_shape:int|tuple[int, ...], eps=1e-5, elementwise_affine=True):
  """
  Set up a layer normalization layer.

  `normalized_shape` is converted to a tuple, and `axis` holds one negative index per entry of it
  (-1, -2, ...). `weight` (ones) and `bias` (zeros) exist only when `elementwise_affine` is true.
  """
  self.normalized_shape: tuple[int, ...] = make_tuple(normalized_shape, 1)
  # trailing axes, counted from the end: -1, -2, ..., -len(normalized_shape)
  self.axis = tuple(range(-1, -1-len(self.normalized_shape), -1))
  self.eps = eps
  self.elementwise_affine = elementwise_affine
  self.weight: Tensor|None = Tensor.ones(*self.normalized_shape) if elementwise_affine else None
  self.bias: Tensor|None = Tensor.zeros(*self.normalized_shape) if elementwise_affine else None

LayerNorm2d ¤

LayerNorm2d(
    normalized_shape: int | tuple[int, ...],
    eps=1e-05,
    elementwise_affine=True,
)

基类: LayerNorm

对2D输入的小批量应用层归一化。

参见：LayerNorm

norm = nn.LayerNorm2d(3)
t = Tensor.rand(2, 3, 4, 4) * 2 + 1
print(t.mean().item(), t.std().item())

1.9809415340423584 0.5907070636749268

t = norm(t)
print(t.mean().item(), t.std().item())

-1.9173016596596426e-07 1.0051860809326172

Source code in tinygrad/nn/__init__.py

def __init__(self, normalized_shape:int|tuple[int, ...], eps=1e-5, elementwise_affine=True):
  """
  Store the normalization configuration and create the optional affine parameters.

  `normalized_shape` is converted to a tuple, and `axis` holds one negative index per entry of it
  (-1, -2, ...). `weight` (ones) and `bias` (zeros) exist only when `elementwise_affine` is true.
  """
  self.normalized_shape: tuple[int, ...] = make_tuple(normalized_shape, 1)
  # trailing axes, counted from the end: -1, -2, ..., -len(normalized_shape)
  self.axis = tuple(range(-1, -1-len(self.normalized_shape), -1))
  self.eps = eps
  self.elementwise_affine = elementwise_affine
  self.weight: Tensor|None = Tensor.ones(*self.normalized_shape) if elementwise_affine else None
  self.bias: Tensor|None = Tensor.zeros(*self.normalized_shape) if elementwise_affine else None

RMSNorm ¤

RMSNorm(dim: int, eps=1e-06)

对输入应用均方根归一化。

描述：https://paperswithcode.com/method/rmsnorm
论文: https://arxiv.org/abs/1910.07467

norm = nn.RMSNorm(4)
t = Tensor.arange(12, dtype=dtypes.float).reshape(3, 4)
print(t.numpy())

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]

print(norm(t).numpy())

[[0.     0.5345 1.069  1.6036]
 [0.7127 0.8909 1.069  1.2472]
 [0.8363 0.9409 1.0454 1.15  ]]

Source code in tinygrad/nn/__init__.py

def __init__(self, dim:int, eps=1e-6):
  """Store `eps` and create a `weight` of ones with length `dim`."""
  self.eps = eps
  self.weight = Tensor.ones(dim)

Embedding ¤

Embedding(vocab_size: int, embed_size: int)

一个简单的查找表，用于存储固定字典和大小的嵌入向量。

参见：https://pytorch.org/docs/stable/generated/torch.nn.Embedding

emb = nn.Embedding(10, 3)
print(emb(Tensor([1, 2, 3, 1])).numpy())

[[ 0.5506  0.2185 -0.3325]
 [-0.1563  0.527  -0.6618]
 [ 0.2488 -0.338  -0.4311]
 [ 0.5506  0.2185 -0.3325]]

Source code in tinygrad/nn/__init__.py

def __init__(self, vocab_size:int, embed_size:int):
  """Record the table dimensions and create a Glorot-uniform (vocab_size, embed_size) weight."""
  self.vocab_sz = vocab_size
  self.embed_sz = embed_size
  self.weight = Tensor.glorot_uniform(vocab_size, embed_size)

LSTMCell ¤

LSTMCell(
    input_size: int, hidden_size: int, bias: bool = True
)

长短期记忆（LSTM）单元。

参数:

input_size (int) –

输入x中预期的特征数量
hidden_size (int) –

隐藏状态h中的特征数量
bias (bool, default: True ) –

如果设为False，则该层不使用偏置权重b_ih和b_hh

Source code in tinygrad/nn/__init__.py

def __init__(self, input_size:int, hidden_size:int, bias:bool=True):
  """
  Create the LSTM cell parameters.

  The two weights have `hidden_size*4` rows and are drawn uniformly from
  [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]. The two biases are zeros, or None when `bias` is false.
  """
  bound = 1.0 / math.sqrt(hidden_size)
  gate_rows = hidden_size*4
  self.weight_ih = Tensor.uniform(gate_rows, input_size, low=-bound, high=bound)
  self.weight_hh = Tensor.uniform(gate_rows, hidden_size, low=-bound, high=bound)
  self.bias_ih: Tensor|None = Tensor.zeros(gate_rows) if bias else None
  self.bias_hh: Tensor|None = Tensor.zeros(gate_rows) if bias else None

优化器¤

SGD ¤

SGD(
    params: list[Tensor],
    lr=0.001,
    momentum=0.0,
    weight_decay=0.0,
    nesterov=False,
    classic=False,
)

随机梯度下降(SGD)优化器，可选动量和权重衰减。

classic 是一个布尔标志，用于确定是使用流行的动量更新规则还是经典动量更新规则。

描述：https://paperswithcode.com/method/sgd

Source code in tinygrad/nn/optim.py

def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
  """
  Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.

  `classic` is a boolean flag that determines whether to use the popular momentum update rule or the classic momentum update rule.

  The optimizer returned is a `LARS` instance constructed with `tcoef=0.0`.

  - Described: https://paperswithcode.com/method/sgd
  """
  return LARS(params=params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov, classic=classic, tcoef=0.0)

LARS ¤

LARS(
    params: list[Tensor],
    lr=0.001,
    momentum=0.9,
    weight_decay=0.0001,
    nesterov=False,
    classic=True,
    tcoef=0.001,
)

基类: Optimizer

层自适应学习率缩放（LARS）优化器，可选动量和权重衰减。

描述：https://paperswithcode.com/method/lars
论文: https://arxiv.org/abs/1708.03888v3

Source code in tinygrad/nn/optim.py

def __init__(self, params:list[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=False, classic=True, tcoef=0.001):
  """
  Store the LARS hyperparameters and allocate the momentum buffers.

  `weight_decay` is kept as `self.wd`. `self.b` holds one zero tensor per parameter (matching its shape,
  dtype and device) when `momentum` is truthy, and is an empty list otherwise.
  """
  super().__init__(params, lr)
  self.momentum = momentum
  self.wd = weight_decay
  self.nesterov = nesterov
  self.classic = classic
  self.tcoef = tcoef
  if self.momentum:
    self.b = [Tensor.zeros(*p.shape, dtype=p.dtype, device=p.device, requires_grad=False) for p in self.params]
  else:
    self.b = []

AdamW ¤

AdamW(
    params: list[Tensor],
    lr=0.001,
    b1=0.9,
    b2=0.999,
    eps=1e-08,
    weight_decay=0.01,
)

带有可选权重衰减的AdamW优化器。

描述：https://paperswithcode.com/method/adamw
论文: https://arxiv.org/abs/1711.05101v3

Source code in tinygrad/nn/optim.py

def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
  """
  AdamW optimizer with optional weight decay.

  The optimizer returned is a `LAMB` instance constructed with `adam=True`.

  - Described: https://paperswithcode.com/method/adamw
  - Paper: https://arxiv.org/abs/1711.05101v3
  """
  return LAMB(params=params, lr=lr, b1=b1, b2=b2, eps=eps, weight_decay=weight_decay, adam=True)

Adam ¤

Adam(
    params: list[Tensor],
    lr=0.001,
    b1=0.9,
    b2=0.999,
    eps=1e-08,
)

Adam优化器。

描述：https://paperswithcode.com/method/adam
论文: https://arxiv.org/abs/1412.6980

Source code in tinygrad/nn/optim.py

def Adam(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
  """
  Adam optimizer.

  The optimizer returned is a `LAMB` instance constructed with `adam=True` and a weight decay of 0.0.

  - Described: https://paperswithcode.com/method/adam
  - Paper: https://arxiv.org/abs/1412.6980
  """
  return LAMB(params=params, lr=lr, b1=b1, b2=b2, eps=eps, weight_decay=0.0, adam=True)

LAMB ¤

LAMB(
    params: list[Tensor],
    lr=0.001,
    b1=0.9,
    b2=0.999,
    eps=1e-06,
    weight_decay=0.0,
    adam=False,
)

基类: Optimizer

带有可选权重衰减的LAMB优化器。

描述：https://paperswithcode.com/method/lamb
论文：https://arxiv.org/abs/1904.00962

Source code in tinygrad/nn/optim.py

def __init__(self, params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, adam=False):
  """
  Store the LAMB hyperparameters and allocate the optimizer state.

  `weight_decay` is kept as `self.wd`. `b1_t` and `b2_t` are one-element float32 tensors initialised to one;
  `m` and `v` each hold one float32 zero tensor per parameter, on that parameter's device.
  """
  super().__init__(params, lr)
  self.b1 = b1
  self.b2 = b2
  self.eps = eps
  self.wd = weight_decay
  self.adam = adam

  # NOTE(review): presumably these accumulate powers of b1/b2 during stepping; the update code is not shown here
  self.b1_t = Tensor.ones((1,), dtype=dtypes.float32, device=self.device, requires_grad=False).contiguous()
  self.b2_t = Tensor.ones((1,), dtype=dtypes.float32, device=self.device, requires_grad=False).contiguous()

  def _fp32_zeros(p):
    # float32 state buffer matching the parameter's shape and device
    return Tensor.zeros(*p.shape, dtype=dtypes.float32, device=p.device, requires_grad=False).contiguous()

  self.m = [_fp32_zeros(p) for p in self.params]
  self.v = [_fp32_zeros(p) for p in self.params]

加载/保存¤

safe_load ¤

safe_load(
    fn: Union[Tensor, str, Path],
) -> dict[str, Tensor]

加载一个.safetensor文件，返回state_dict。

state_dict = nn.state.safe_load("test.safetensor")

Source code in tinygrad/nn/state.py

def safe_load(fn:Union[Tensor, str, pathlib.Path]) -> dict[str, Tensor]:
  """
  Loads a .safetensor file, returning the `state_dict`.

  Every header entry except `"__metadata__"` becomes one tensor: its byte range is sliced out of the data
  section, bitcast to the entry's dtype and reshaped to the entry's shape.

  ```python
  state_dict = nn.state.safe_load("test.safetensor")
  ```
  """
  t, data_start, metadata = safe_load_metadata(fn)
  data = t[data_start:]
  state_dict = {}
  for name, info in metadata.items():
    if name == "__metadata__": continue
    begin, end = info['data_offsets'][0], info['data_offsets'][1]
    state_dict[name] = data[begin:end].bitcast(safe_dtypes[info['dtype']]).reshape(info['shape'])
  return state_dict

safe_save ¤

safe_save(
    tensors: dict[str, Tensor],
    fn: str,
    metadata: Optional[dict[str, Any]] = None,
)

将state_dict保存到.safetensor格式的磁盘文件中，可包含可选元数据。

t = Tensor([1, 2, 3])
nn.state.safe_save({'t':t}, "test.safetensor")

Source code in tinygrad/nn/state.py

def safe_save(tensors:dict[str, Tensor], fn:str, metadata:Optional[dict[str, Any]]=None):
  """
  Saves a `state_dict` to disk in a .safetensor file with optional metadata.

  ```python
  t = Tensor([1, 2, 3])
  nn.state.safe_save({'t':t}, "test.safetensor")
  ```
  """
  # build the JSON header: one entry per tensor with its dtype name, shape and [start, end) byte range
  # in the data section; `offset` ends up as the total number of data bytes
  headers, offset = {}, 0
  if metadata: headers['__metadata__'] = metadata
  for k,v in tensors.items():
    headers[k] = {'dtype': inverse_safe_dtypes[v.dtype], 'shape': list(v.shape), 'data_offsets':[offset, offset+v.nbytes()]}
    offset += v.nbytes()
  j = json.dumps(headers, separators=(',', ':'))
  # pad the header with spaces (presumably up to the next multiple of 8, via round_up)
  j += "\x20"*(round_up(len(j),8)-len(j))
  # remove any existing file at `fn` before creating the output tensor on the "disk:" device
  pathlib.Path(fn).unlink(missing_ok=True)
  # layout: 8 bytes for the header length, then the header, then `offset` bytes of tensor data
  t = Tensor.empty(8+len(j)+offset, dtype=dtypes.uint8, device=f"disk:{fn}")
  t[0:8].bitcast(dtypes.int64).assign([len(j)])
  t[8:8+len(j)].assign(list(j.encode('utf-8')))
  # parse the header just written to obtain one slice of `t` per tensor, then copy each tensor into its slice
  for k,v in safe_load(t).items(): v.assign(tensors[k])

get_state_dict ¤

get_state_dict(
    obj, prefix: str = "", tensor_type=Tensor
) -> dict[str, Tensor]

返回对象的state_dict，可选择添加前缀。

class Net:
  def __init__(self):
    self.l1 = nn.Linear(4, 5)
    self.l2 = nn.Linear(5, 6)

net = Net()
print(nn.state.get_state_dict(net).keys())

dict_keys(['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias'])

Source code in tinygrad/nn/state.py

def get_state_dict(obj, prefix:str='', tensor_type=Tensor) -> dict[str, Tensor]:
  """
  Returns a `state_dict` of the object, with optional prefix.

  The object is walked recursively through namedtuples, `__dict__` attributes, lists, tuples and dicts.
  Keys are the dot-joined path to each `tensor_type` instance; anything else contributes nothing.

  ```python exec="true" source="above" session="tensor" result="python"
  class Net:
    def __init__(self):
      self.l1 = nn.Linear(4, 5)
      self.l2 = nn.Linear(5, 6)

  net = Net()
  print(nn.state.get_state_dict(net).keys())
  ```
  """
  # leaf: a tensor is keyed by the accumulated path, without the leading/trailing dots
  if isinstance(obj, tensor_type): return {prefix.strip('.'):obj}
  # wrappers that are unwrapped into a plain dict and walked again with the same prefix
  if hasattr(obj, '_asdict'): return get_state_dict(obj._asdict(), prefix, tensor_type)  # namedtuple
  if isinstance(obj, OrderedDict): return get_state_dict(dict(obj), prefix, tensor_type)
  if hasattr(obj, '__dict__'): return get_state_dict(obj.__dict__, prefix, tensor_type)
  # containers: children are keyed by index (list/tuple) or by key (dict)
  if isinstance(obj, (list, tuple)): children = enumerate(obj)
  elif isinstance(obj, dict): children = obj.items()
  else: return {}
  collected = {}
  for key, child in children: collected.update(get_state_dict(child, f"{prefix}{str(key)}.", tensor_type))
  return collected

get_parameters ¤

get_parameters(obj) -> list[Tensor]

class Net:
  def __init__(self):
    self.l1 = nn.Linear(4, 5)
    self.l2 = nn.Linear(5, 6)

net = Net()
print(len(nn.state.get_parameters(net)))

Source code in tinygrad/nn/state.py

def get_parameters(obj) -> list[Tensor]:
  """
  Returns the values of the object's `state_dict` as a list.

  ```python exec="true" source="above" session="tensor" result="python"
  class Net:
    def __init__(self):
      self.l1 = nn.Linear(4, 5)
      self.l2 = nn.Linear(5, 6)

  net = Net()
  print(len(nn.state.get_parameters(net)))
  ```
  """
  state = get_state_dict(obj)
  return [*state.values()]

load_state_dict ¤

load_state_dict(
    model,
    state_dict: dict[str, Tensor],
    strict=True,
    verbose=True,
    consume=False,
    realize=True,
) -> None

将 state_dict 加载到模型中。

class Net:
  def __init__(self):
    self.l1 = nn.Linear(4, 5)
    self.l2 = nn.Linear(5, 6)

net = Net()
state_dict = nn.state.get_state_dict(net)
nn.state.load_state_dict(net, state_dict)

Source code in tinygrad/nn/state.py

def load_state_dict(model, state_dict:dict[str, Tensor], strict=True, verbose=True, consume=False, realize=True) -> None:
  """
  Loads a `state_dict` into a model.

  Each tensor found in the model by `get_state_dict` is replaced with the entry of the same key in `state_dict`.

  - `strict`: when false, model keys missing from `state_dict` are skipped; when true they are looked up anyway.
  - `verbose`: enables the timing message and the progress bar.
  - `consume`: deletes each entry from `state_dict` once it has been loaded.
  - `realize`: realizes each model tensor right after it is replaced.

  Raises `ValueError` when a model tensor and its `state_dict` entry have different shapes.

  ```python
  class Net:
    def __init__(self):
      self.l1 = nn.Linear(4, 5)
      self.l2 = nn.Linear(5, 6)

  net = Net()
  state_dict = nn.state.get_state_dict(net)
  nn.state.load_state_dict(net, state_dict)
  ```
  """
  # baseline for the "GB loaded" figure reported when the Timing block ends
  start_mem_used = GlobalCounters.mem_used
  with Timing("loaded weights in ",
              lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s", enabled=verbose):
    model_state_dict = get_state_dict(model)
    # only a size comparison: warns when state_dict has more entries than the model
    if DEBUG >= 1 and len(state_dict) > len(model_state_dict):
      print("WARNING: unused weights in state_dict", sorted(list(state_dict.keys() - model_state_dict.keys())))
    for k,v in (t := tqdm(model_state_dict.items(), disable=CI or not verbose)):
      t.desc = f"ram used: {GlobalCounters.mem_used/1e9:5.2f} GB, {k:50s}: "
      # a missing key is skipped only in non-strict mode; in strict mode the lookup below raises KeyError
      if k not in state_dict and not strict:
        if DEBUG >= 1: print(f"WARNING: not loading {k}")
        continue
      if v.shape != state_dict[k].shape:
        raise ValueError(f'Shape mismatch in layer `{k}`: Expected shape {v.shape}, but found {state_dict[k].shape} in state dict.')
      # a tuple device presumably marks a tensor spread over several devices — TODO confirm
      if isinstance(v.device, tuple):
        # source already has a tuple device: use it as is; otherwise shard it to match the model tensor
        if isinstance(state_dict[k].device, tuple): v.replace(state_dict[k])
        else: v.replace(state_dict[k].shard(v.device, v.lazydata.axis))
      # single device: move the source tensor to the model tensor's device
      else: v.replace(state_dict[k].to(v.device))
      if realize: v.realize()
      if consume: del state_dict[k]

`tar_extract` ¤

tar_extract(fn: Tensor | str | Path) -> dict[str, Tensor]

从tar存档中提取文件，并将它们作为名称（键）和张量（值）的字典返回。

tensors = nn.state.tar_extract(Tensor(pathlib.Path("archive.tar")))

Source code in tinygrad/nn/state.py

@accept_filename
def tar_extract(t: Tensor) -> dict[str, Tensor]:
  """
  ```python
  tar_extract(fn: Tensor | str | Path) -> dict[str, Tensor]
  ```

  Extracts files from a tar archive and returns them as a dictionary of names (keys) and tensors (values).

  Only regular-file members are returned; each value is the slice of the input tensor that holds the member's data.

  ```python
  tensors = nn.state.tar_extract(Tensor(pathlib.Path("archive.tar")))
  ```
  """
  extracted = {}
  with tarfile.open(fileobj=TensorIO(t), mode="r") as tar:
    for member in tar:
      # skip directories, links and other non-regular entries
      if member.type != tarfile.REGTYPE: continue
      start = member.offset_data
      extracted[member.name] = t[start:start+member.size]
  return extracted

`torch_load` ¤

torch_load(fn: Tensor | str | Path) -> dict[str, Tensor]

加载一个torch .pth文件，返回state_dict。

state_dict = nn.state.torch_load("test.pth")

Source code in tinygrad/nn/state.py

@accept_filename
def torch_load(t:Tensor) -> dict[str, Tensor]:
  """
  ```python
  torch_load(fn: Tensor | str | Path) -> dict[str, Tensor]
  ```

  Loads a torch .pth file, returning the `state_dict`.

  Three container layouts are handled, tried in this order: a zip archive, a tar archive, and a fallback
  that reads a sequence of pickles directly from the stream. Tensors are built as slices of `t` rather than
  being deserialized by torch.

  ```python
  state_dict = nn.state.torch_load("test.pth")
  ```
  """
  # storage key -> byte offset of that storage's data inside `t`
  offsets: dict[Union[str, int], int] = {}
  # storage key -> byte length of that storage (filled in while unpickling)
  lens: dict[Union[str, int], int] = {}
  def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad=None, backward_hooks=None, metadata=None):
    # `storage` is indexed positionally: [1] is the dtype, [2] the storage key, [4] the element count
    #print(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata)
    lens[storage[2]] = storage[4] * storage[1].itemsize
    # offset not known yet: only the length is recorded (the fallback branch below relies on this)
    if storage[2] not in offsets: return None
    byte_offset = offsets[storage[2]]+storage_offset*storage[1].itemsize
    ret = t[byte_offset:byte_offset+prod(size)*storage[1].itemsize].bitcast(storage[1])

    # 7 lines to deal with permuted tensors. NOTE: this currently requires reading off the disk
    # size-1 dimensions are ignored when deciding whether the strides describe a permutation
    shape_strides = [(s, st) for s,st in zip(size, stride) if s != 1]
    permute_indexes = [len(shape_strides)-1-y for y in argsort([x[1] for x in shape_strides])]
    if tuple(permute_indexes) != tuple(range(len(permute_indexes))):
      intermediate_shape = tuple([shape_strides[x][0] for x in argsort(permute_indexes)])
      assert tuple([shape_strides[i][1] for i in argsort(permute_indexes)]) == strides_for_shape(intermediate_shape), "nonpermutable strides"
      if DEBUG >= 3: print(f"WARNING: this torch load is slow. to permute {intermediate_shape} with {permute_indexes}")
      assert storage[1] != dtypes.bfloat16, "can't permute BF16"
      # TODO: find a nice way to support all shapetracker on disktensors
      ret = ret.to(None).reshape(intermediate_shape).permute(permute_indexes)

    return ret.reshape(size)

  # stand-in for the pickled "Parameter" class: keeps only the first element of the pickled state
  class Parameter:
    def __setstate__(self, state): self.tensor = state[0]

  # objects looked up by persistent id during unpickling (populated only by the tar branch)
  deserialized_objects: dict[str, Any] = {}
  # names under the "torch" module are resolved through this table instead of being imported
  intercept = {"HalfStorage": dtypes.float16, "FloatStorage": dtypes.float32, "BFloat16Storage": dtypes.bfloat16,
               "IntStorage": dtypes.int32, "BoolStorage": dtypes.bool,
               "LongStorage": dtypes.int64, "_rebuild_tensor_v2": _rebuild_tensor_v2, "FloatTensor": None, "Parameter": Parameter}
  whitelist = {"torch", "collections", "numpy", "_codecs"}  # NOTE: this is not for security, only speed
  class Dummy: pass
  class TorchPickle(pickle.Unpickler):
    def find_class(self, module, name):
      # modules outside the whitelist resolve to Dummy; "torch" names go through `intercept`
      # (an unlisted torch name raises KeyError); other whitelisted modules use the default lookup
      module_root = module.split(".")[0]
      if module_root not in whitelist:
        if DEBUG >= 2: print(f"WARNING: returning Dummy for {module} {name}")
        return Dummy
      return intercept[name] if module_root == "torch" else super().find_class(module, name)
    # unknown persistent ids are passed through unchanged
    def persistent_load(self, pid): return deserialized_objects.get(pid, pid)

  fobj = io.BufferedReader(TensorIO(t))
  # rewinds the stream after a format probe and hands the probe result back
  def passthrough_reset(v: bool): return fobj.seek(0, 0) or v

  if passthrough_reset(zipfile.is_zipfile(fobj)): # NOTE: passthrough_reset required to support python < 3.14
    # zip layout: each storage is a member under <base_name>/data/, the pickle is <base_name>/data.pkl
    myzip = zipfile.ZipFile(fobj, 'r')
    base_name = myzip.namelist()[0].split('/', 1)[0]
    for n in myzip.namelist():
      if n.startswith(f'{base_name}/data/'):
        with myzip.open(n) as myfile:
          # private zipfile attribute: where this member's data starts in the underlying stream
          offsets[n.split("/")[-1]] = myfile._orig_compress_start # type: ignore
    with myzip.open(f'{base_name}/data.pkl') as myfile:
      return TorchPickle(myfile).load()
  elif passthrough_reset(tarfile.is_tarfile(fobj)): # NOTE: passthrough_reset required to support python < 3.11
    # tar layout: members 'storages', 'tensors' and 'pickle'
    with tarfile.open(fileobj=fobj, mode="r") as tar:
      storages_offset = tar.getmember('storages').offset_data
      f = unwrap(tar.extractfile('storages'))
      for i in range(TorchPickle(f).load()):  # num_storages
        # each record: a pickled (key, _, storage_type) tuple, an 8-byte element count, then the raw data
        (key, _, storage_type), sz = TorchPickle(f).load(), struct.unpack('<q', f.read(8))[0]
        offsets[key] = storages_offset + f.tell()
        f.seek(sz*storage_type.itemsize, 1)
      f = unwrap(tar.extractfile('tensors'))
      for _ in range(TorchPickle(f).load()):  # num_tensors
        # each record: a pickled (key, storage_id, _) tuple, a 4-byte ndim, 4 skipped bytes,
        # ndim sizes, ndim strides, and an 8-byte storage offset
        (key, storage_id, _), ndim, _ = TorchPickle(f).load(), struct.unpack('<i', f.read(4))[0], f.read(4)
        size, stride = struct.unpack(f'<{ndim}q', f.read(8 * ndim)), struct.unpack(f'<{ndim}q', f.read(8 * ndim))
        storage_offset = struct.unpack('<q', f.read(8))[0]
        # NOTE(review): storage_type here is whatever the last iteration of the storages loop left behind
        deserialized_objects[str(key)] = _rebuild_tensor_v2((None, storage_type, storage_id, None, -1), storage_offset, size, stride)
      return {k:v.tensor if isinstance(v, Parameter) else v for k,v in TorchPickle(unwrap(tar.extractfile('pickle'))).load().items()}
  else:
    # fallback layout: consecutive pickles read straight from the stream
    pkl = TorchPickle(fobj)
    # first pass: `offsets` is still empty, so _rebuild_tensor_v2 only records lengths in `lens`;
    # rwd is the position of the 4th pickle, ids is the 5th pickle, base_offset is where the pickles end
    _, _, _, rwd, _, ids, base_offset = pkl.load(), pkl.load(), pkl.load(), fobj.tell(), pkl.load(), pkl.load(), fobj.tell()
    for i in ids:
      # each storage is preceded by 8 bytes that are skipped here (presumably a size field — TODO confirm)
      offsets[i] = base_offset + 8
      base_offset += 8 + lens[i]
    # second pass: with the offsets known, re-read the pickle at rwd so the tensors are actually built
    fobj.seek(rwd)
    return TorchPickle(fobj).load()

gguf_load ¤

gguf_load(tensor: Tensor) -> tuple[dict, dict[str, Tensor]]

加载一个.gguf文件，返回kv_data和state_dict。

gguf_tensor = Tensor(pathlib.Path("Meta-Llama-3-8B-Instruct.Q4_0.gguf")).to(Device.DEFAULT)
kv_data, state_dict = nn.state.gguf_load(gguf_tensor)

注意

提供的张量必须位于支持执行的设备上。

Source code in tinygrad/nn/state.py

@accept_filename
def gguf_load(tensor: Tensor) -> tuple[dict, dict[str, Tensor]]:
  """
  Loads a .gguf file, returning the `kv_data` and `state_dict`.

  ```python
  gguf_tensor = Tensor(pathlib.Path("Meta-Llama-3-8B-Instruct.Q4_0.gguf")).to(Device.DEFAULT)
  kv_data, state_dict = nn.state.gguf_load(gguf_tensor)
  ```

  NOTE: The provided tensor must be on a device that supports execution.
  """
  reader, kv_data, state_dict = io.BufferedReader(TensorIO(tensor), 1_000_000), {}, {}
  # read n bytes and decode them as a single value with the given struct format
  def read_unpack(fmt: str, n: int): return struct.unpack(fmt, reader.read(n))[0]
  # strings are a uint64 length followed by that many UTF-8 bytes
  def read_str(): return str(reader.read(read_uint64()), "utf-8")
  # arrays are an int32 element type id, a uint64 count, then the elements
  def read_arr():
    # note: `reader` here is a local that shadows the outer BufferedReader within read_arr only
    reader, n = readers[read_int32()], read_uint64()
    return [ reader() for _ in range(n) ]

  # value type id -> zero-argument reader; ids 8 and 9 are string and array, the rest are
  # little-endian scalars described by a (type id, struct format, byte count) triple
  readers: dict[int, Callable[[], Any]] = { 8: read_str, 9: read_arr, **{ t: functools.partial(read_unpack, "<"+f, nb) for t,f,nb in \
    [ (0,"c",1), (1,"b",1), (2,"H",2), (3,"h",2), (4,"I",4), (5,"i",4), (6,"f",4), (7,"?",1), (10,"Q",8), (11,"q",8), (12,"d",8) ] } }
  read_uint32, read_int32, read_uint64, read_int64 = readers[4], readers[5], readers[10], readers[11]

  # file header: 4-byte magic, int32 version, int64 tensor count, int64 key/value count
  magic, version, n_tensors, n_kv = reader.read(4), read_int32(), read_int64(), read_int64()
  if magic != b"GGUF" or version not in [2, 3]: raise ValueError("Invalid GGUF format!")
  # key/value section: each entry is a string key, an int32 value type id, then the value itself
  for _ in range(n_kv):
    k, typ = read_str(), read_int32()
    kv_data[k] = readers[typ]()

  # tensor infos: (name, dims, ggml type id, offset relative to the start of the data section)
  t_infos = [ (read_str(), tuple(read_uint64() for _ in range(read_uint32())), read_int32(), read_uint64()) for _ in range(n_tensors) ]
  # the data section starts at the current position rounded up to the alignment (32 unless "general.alignment" is set)
  alignment, pos = kv_data.get("general.alignment", 32), reader.tell()
  data_start = round_up(pos, alignment)

  # decode each tensor from the bytes starting at its offset; dims are reversed for the final shape
  for name, dims, typ, off in t_infos: state_dict[name] = ggml_data_to_tensor(tensor[data_start + off:], prod(dims), typ).reshape(*reversed(dims))

  return kv_data, state_dict

nn (神经网络)

神经网络类¤

BatchNorm ¤

Conv1d ¤

Conv2d ¤

ConvTranspose1d ¤

ConvTranspose2d ¤

Linear ¤

GroupNorm ¤

InstanceNorm ¤

LayerNorm ¤

LayerNorm2d ¤

RMSNorm ¤

Embedding ¤

LSTMCell ¤

优化器¤

SGD ¤

LARS ¤

AdamW ¤

Adam ¤

LAMB ¤

加载/保存¤

safe_load ¤

safe_save ¤

get_state_dict ¤

get_parameters ¤

load_state_dict ¤

tar_extract ¤

torch_load ¤

gguf_load ¤

`tar_extract` ¤

`torch_load` ¤