diff --git a/gpt.py b/gpt.py
index e4fc68d6..9e9cf9ba 100644
--- a/gpt.py
+++ b/gpt.py
@@ -103,7 +103,7 @@ def forward(self, x):
         out = self.dropout(self.proj(out))
         return out
 
-class FeedFoward(nn.Module):
+class FeedForward(nn.Module):
     """ a simple linear layer followed by a non-linearity """
 
     def __init__(self, n_embd):
@@ -126,7 +126,7 @@ def __init__(self, n_embd, n_head):
         super().__init__()
         head_size = n_embd // n_head
         self.sa = MultiHeadAttention(n_head, head_size)
-        self.ffwd = FeedFoward(n_embd)
+        self.ffwd = FeedForward(n_embd)
         self.ln1 = nn.LayerNorm(n_embd)
         self.ln2 = nn.LayerNorm(n_embd)
 