diff --git a/src/components/Attention.svelte b/src/components/Attention.svelte index 0ea8df8..db38bff 100644 --- a/src/components/Attention.svelte +++ b/src/components/Attention.svelte @@ -47,11 +47,11 @@
-
Multi-head Self Attention
+
多头自注意力
-
Transformer Block 1
+
Transformer 块 1
-
Out
+
输出
{#each $tokens as token, index}
diff --git a/src/components/AttentionMatrix.svelte b/src/components/AttentionMatrix.svelte index d2a06c0..0f4bf68 100644 --- a/src/components/AttentionMatrix.svelte +++ b/src/components/AttentionMatrix.svelte @@ -276,7 +276,7 @@ shape={'circle'} colorScale={qkColorScale} /> -
Dot product
+
点积
@@ -333,7 +333,7 @@ colorScale={maskedColorScale} />
-
Scaling · Mask
+
缩放 · 掩码
@@ -393,7 +393,7 @@ />
-
Softmax · Dropout
+
Softmax · 暂退
@@ -423,7 +423,7 @@ colorScale={softmaxColorScale} /> -
Attention
+
注意力
diff --git a/src/components/Embedding.svelte b/src/components/Embedding.svelte index d5d14d9..b3ae56a 100644 --- a/src/components/Embedding.svelte +++ b/src/components/Embedding.svelte @@ -118,7 +118,7 @@ on:mouseenter={handleMouseEnter} on:mouseleave={handleMouseLeave} > -
Embedding
+
嵌入
@@ -142,10 +142,10 @@
- Token
Embedding
Token 嵌入{`Converts tokens into \nsemantically meaningful \nnumerical representations.`}{`将 Token 转换为具有语义意义的数字表示`}
{#each $tokens as token, index} @@ -205,10 +205,10 @@
- Positional
Encoding
位置编码{`Encodes positional \ninformation of tokens into \nnumerical representations.`}{`将标记的位置信息编码为数字表示`}
{#each $tokens as token, index} diff --git a/src/components/HelpPopover.svelte b/src/components/HelpPopover.svelte index b6a4796..d08bc1f 100644 --- a/src/components/HelpPopover.svelte +++ b/src/components/HelpPopover.svelte @@ -35,7 +35,7 @@ {#if goTo}
- Read more + 阅读更多
{/if}
- Examples + 示例 {#each inputTextExample as text, index} @@ -185,10 +185,10 @@ {/if} {#if $isLoaded && $isFetchingModel} Try the examples while GPT-2 model is being downloaded (600MB) 在 GPT-2 模型(600MB)下载期间,可先尝试这些示例 {:else if exceedLimit} - You can enter up to {wordLimit} words. + 您最多可以输入 {wordLimit} 个词。 {/if}
@@ -201,7 +201,7 @@ type="submit" on:click={handleSubmit} > - Generate + 生成 diff --git a/src/components/LinearSoftmax.svelte b/src/components/LinearSoftmax.svelte index dfa7783..2439dc2 100644 --- a/src/components/LinearSoftmax.svelte +++ b/src/components/LinearSoftmax.svelte @@ -150,7 +150,7 @@ on:mouseenter={handleMouseEnter} on:mouseleave={handleMouseLeave} > -
Probabilities
+
概率
Logits
-
Exponents
+
指数
diff --git a/src/components/Mlp.svelte b/src/components/Mlp.svelte index 34a6173..15e1682 100644 --- a/src/components/Mlp.svelte +++ b/src/components/Mlp.svelte @@ -30,7 +30,7 @@
- MLP + 多层感知器(MLP)
diff --git a/src/components/Operation.svelte b/src/components/Operation.svelte index 768b326..cb1a94b 100644 --- a/src/components/Operation.svelte +++ b/src/components/Operation.svelte @@ -62,7 +62,7 @@ - Layer Normalization + 层归一化 {/if}
{:else if type === 'residual-start'} @@ -70,7 +70,7 @@
{#if head} - Residual{/if} + 残差{/if}
diff --git a/src/components/Popovers/ActivationPopover.svelte b/src/components/Popovers/ActivationPopover.svelte index fc3a3aa..f1b01d8 100644 --- a/src/components/Popovers/ActivationPopover.svelte +++ b/src/components/Popovers/ActivationPopover.svelte @@ -11,7 +11,7 @@
- Applies activation function to neuron outputs. + 将激活函数应用于神经元输出。
diff --git a/src/components/Popovers/CommonPopover.svelte b/src/components/Popovers/CommonPopover.svelte index 985f6e5..3688b6e 100644 --- a/src/components/Popovers/CommonPopover.svelte +++ b/src/components/Popovers/CommonPopover.svelte @@ -42,7 +42,7 @@ {#if goTo}
- Read more + 阅读更多
{/if}
-
Disables randomly selected neurons.
禁用随机选择的神经元。 diff --git a/src/components/Popovers/LayerNormPopover.svelte index f0a8094..50b5304 100644 --- a/src/components/Popovers/LayerNormPopover.svelte +++ b/src/components/Popovers/LayerNormPopover.svelte @@ -14,7 +14,7 @@
- Standardizes layer inputs to maintain consistent mean and variance. + 标准化层输入以保持一致的均值和方差。
diff --git a/src/components/Popovers/MLPWeightPopover.svelte b/src/components/Popovers/MLPWeightPopover.svelte index 26c81e8..1db3268 100644 --- a/src/components/Popovers/MLPWeightPopover.svelte +++ b/src/components/Popovers/MLPWeightPopover.svelte @@ -283,7 +283,7 @@
-
Token Embedding
+
Token 嵌入
×
-
Q·K·V Weights
+
Q·K·V 权重
+
-
bias
+
偏置
-
Position
+
位置
{#each $tokens as token, token_idx}
-
Embedding
+
嵌入
{#each $tokens as token, token_idx}
-
Encoding Matrix
+
编码矩阵
diff --git a/src/components/Popovers/QKVWeightPopover.svelte b/src/components/Popovers/QKVWeightPopover.svelte index 8b9f42d..fefb0b2 100644 --- a/src/components/Popovers/QKVWeightPopover.svelte +++ b/src/components/Popovers/QKVWeightPopover.svelte @@ -221,7 +221,7 @@
-

QKV Calculation

+

QKV 计算过程

{#if isAnimationActive}
-
Embedding
+
嵌入
×
-
Q·K·V Weights
+
Q·K·V 权重
+
-
Bias
+
偏置
- Adds skip-connections to allow for better gradient flow. + 添加跳跃连接(skip connection)以实现更好的梯度流。
diff --git a/src/components/SubsequentBlocks.svelte b/src/components/SubsequentBlocks.svelte index 816f885..b6d0841 100644 --- a/src/components/SubsequentBlocks.svelte +++ b/src/components/SubsequentBlocks.svelte @@ -32,9 +32,9 @@ -->
- {$modelMeta.layer_num - 1} more identical
Transformer
Blocks
. + 还有 {$modelMeta.layer_num - 1} 个相同的
Transformer 块
-
Temperature
+
Temperature(温度)
- {`Changes the output \nprobability distribution \nand randomness \nof next token.`} + {`改变下一个 token 的输出概率分布和随机性。`}
diff --git a/src/components/article/Article.svelte b/src/components/article/Article.svelte index 2deda20..fbda609 100644 --- a/src/components/article/Article.svelte +++ b/src/components/article/Article.svelte @@ -15,472 +15,380 @@
-

What is a Transformer?

+

什么是 Transformer?

- Transformer is a neural network architecture that has fundamentally changed the approach to - Artificial Intelligence. Transformer was first introduced in the seminal paper + Transformer 是一种从根本上改变了人工智能方法的神经网络架构。它首次出现在 2017 年的开创性论文 "Attention is All You Need"《Attention is All You Need》 - in 2017 and has since become the go-to architecture for deep learning models, powering text-generative - models like OpenAI's GPT, Meta's Llama, and Google's - Gemini. Beyond text, Transformer is also applied in + 中,此后已成为深度学习模型的首选架构,为 OpenAI + 的 GPT、Meta 的 Llama 和 Google 的 + Gemini 等文本生成模型提供支持。 + 除了文本之外,Transformer 还应用于 audio generation, + target="_blank">音频生成、 image recognition, + target="_blank">图像识别、 protein structure prediction, and even + >蛋白质结构预测,甚至 game playing, demonstrating its versatility across numerous domains. + target="_blank">游戏中,展示了其在众多领域的通用性。

- Fundamentally, text-generative Transformer models operate on the principle of next-word prediction: given a text prompt from the user, what is the most probable next word that will follow - this input? The core innovation and power of Transformers lie in their use of self-attention mechanism, - which allows them to process entire sequences and capture long-range dependencies more effectively - than previous architectures. + 从根本上讲,文本生成 Transformer 模型的运行原理是下一个单词预测:给定用户的文本提示, + 紧随此输入之后的最有可能的下一个单词是什么?Transformer 的核心创新和强大之处在于它们使用了 + 自注意力机制,这使得它们能够比以前的架构更有效地处理整个序列并捕获长距离依赖关系。

- GPT-2 family of models are prominent examples of text-generative Transformers. Transformer - Explainer is powered by the + GPT-2 系列模型是文本生成 Transformers 的杰出代表。Transformer Explainer 基于 GPT-2 - (small) model which has 124 million parameters. While it is not the latest or most powerful Transformer - model, it shares many of the same architectural components and principles found in the current - state-of-the-art models making it an ideal starting point for understanding the basics. + (small),该模型有 1.24 亿个参数。虽然它不是最新或最强大的 Transformer 模型, + 但它具有许多与当前最先进模型相同的架构组件和原理,使其成为理解基础知识的理想起点。

-

Transformer Architecture

+

Transformer 架构

- Every text-generative Transformer consists of these three key components: + 每个文本生成 Transformer 都由以下三个关键组件组成:

  1. - Embedding: Text input is divided into smaller units - called tokens, which can be words or subwords. These tokens are converted into numerical - vectors called embeddings, which capture the semantic meaning of words. + 嵌入(Embedding):文本输入被划分为更小的单位, + 称为标记(token),可以是单词或子单词。这些标记被转换成数值向量,称为嵌入(Embedding),用于捕获单词的语义。
  2. - Transformer Block is the fundamental building block of - the model that processes and transforms the input data. Each block includes: + Transformer Block 是模型的基本构建块,用于处理和转换输入数据。 + 每个块包括:
    • - Attention Mechanism, the core component of the Transformer block. It - allows tokens to communicate with other tokens, capturing contextual information and - relationships between words. + 注意力机制(Attention Mechanism),Transformer 模块的核心组件。它允许 + token 与其他 token 进行通信,从而捕获上下文信息和单词之间的关系。
  • - MLP (Multilayer Perceptron) Layer, a feed-forward network that operates - on each token independently. While the goal of the attention layer is to route - information between tokens, the goal of the MLP is to refine each token's - representation. + MLP 层(多层感知器 Multilayer Perceptron), + 一个独立对每个标记进行操作的前馈网络。注意力层的目标是在标记之间路由 + 信息,而 MLP 的目标是优化每个标记的表示。
  3. - Output Probabilities: The final linear and softmax - layers transform the processed embeddings into probabilities, enabling the model to make - predictions about the next token in a sequence. + 输出概率(Output Probabilities): + 最后的线性层和 softmax 层将处理后的嵌入转换为概率,使模型能够对序列中的下一个标记做出预测。
-

Embedding

+

嵌入

- Let's say you want to generate text using a Transformer model. You add the prompt like this - one: “Data visualization empowers users to”. This input needs to be converted - into a format that the model can understand and process. That is where embedding comes in: - it transforms the text into a numerical representation that the model can work with. To - convert a prompt into embedding, we need to 1) tokenize the input, 2) obtain token - embeddings, 3) add positional information, and finally 4) add up token and position - encodings to get the final embedding. Let’s see how each of these steps is done. + 假设您想使用 Transformer 模型生成文本。您添加如下提示词(prompt):“Data visualization empowers users to”。 + 此输入需要转换为模型可以理解和处理的格式。这就是嵌入的作用所在:它将文本转换为模型可以使用的数字表示。要将提示转换为嵌入, + 我们需要 1) 对输入进行标记,2) 获取标记嵌入,3) 添加位置信息,最后 4) 将标记和位置编码相加以获得最终嵌入。 + 让我们看看每个步骤是如何完成的。

- Figure 1. Expanding the Embedding layer view, showing how the - input prompt is converted to a vector representation. The process involves - (1) Tokenization, (2) Token Embedding, (3) Positional Encoding, - and (4) Final Embedding. + 图 1:展开嵌入层视图,显示如何将输入提示转换为向量表示。 + 该过程涉及 (1) 标记化、(2) 标记嵌入、(3) 位置编码和 (4) 最终嵌入。
-

Step 1: Tokenization

+

步骤1:标记化

- Tokenization is the process of breaking down the input text into smaller, more manageable - pieces called tokens. These tokens can be a word or a subword. The words "Data" - and "visualization" correspond to unique tokens, while the word - "empowers" - is split into two tokens. The full vocabulary of tokens is decided before training the model: - GPT-2's vocabulary has 50,257 unique tokens. Now that we split our input text - into tokens with distinct IDs, we can obtain their vector representation from embeddings. + 标记化(Tokenization)是将输入文本分解为更小、更易于管理的部分(称为标记)的过程。这些标记可以是单词或子单词。 + 单词 “Data” 和 “visualization” 对应于唯一标记,而单词 “empowers” 则 + 被拆分为两个标记。完整的标记词汇表是在训练模型之前确定的:GPT-2 的词汇表有 50,257 个唯一标记。 + 现在我们已将输入文本拆分为具有不同 ID 的标记,接下来就可以从嵌入中获取它们的向量表示。
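作为补充,下面用 tiktoken 库(假设已安装,并非本项目实际使用的实现)演示 GPT-2 的 BPE 分词;具体的 token ID 以实际运行结果为准:

```python
# 极简示意:用 tiktoken(假设已安装)查看 GPT-2 的 BPE 分词结果
import tiktoken

enc = tiktoken.get_encoding("gpt2")                    # GPT-2 的 BPE 编码,词表共 50,257 个 token
ids = enc.encode("Data visualization empowers users to")
print(ids)                                             # 每个 token 对应一个唯一 ID
print([enc.decode([i]) for i in ids])                  # 每个 ID 对应的文本片段,可以看到 "empowers" 被拆成子词
print(enc.n_vocab)                                     # 50257
```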

-

Step 2. Token Embedding

+

步骤2:Token 嵌入

- GPT-2 Small represents each token in the vocabulary as a 768-dimensional vector; the - dimension of the vector depends on the model. These embedding vectors are stored in a - matrix of shape (50,257, 768), containing approximately 39 million - parameters! This extensive matrix allows the model to assign semantic meaning to each - token. + GPT-2 Small 将词汇表中的每个标记表示为一个 768 维向量;向量的维度取决于模型。这些嵌入向量存储在形状为 + (50,257, 768) 的矩阵中,包含大约 3900 万个参数!这个庞大的矩阵使模型能够为每个标记赋予语义含义。
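下面是一个极简的 NumPy 示意(wte 为假设的变量名,权重随机初始化,仅用于说明矩阵形状、参数量与查表过程):

```python
import numpy as np

vocab_size, d_model = 50257, 768
wte = np.random.randn(vocab_size, d_model).astype(np.float32)  # 词嵌入矩阵(随机数,仅作演示)
print(wte.size)                      # 50257 * 768 = 38,597,376,约 3900 万个参数

token_ids = [1, 42, 300]             # 假设的 token ID,仅作演示
token_emb = wte[token_ids]           # 按 ID 查表得到嵌入,形状为 (3, 768)
print(token_emb.shape)
```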

-

Step 3. Positional Encoding

+

步骤3:位置编码

- The Embedding layer also encodes information about each token's position in the input - prompt. Different models use various methods for positional encoding. GPT-2 trains its own - positional encoding matrix from scratch, integrating it directly into the training - process. + Embedding 层还对每个 token 在输入提示中的位置信息进行编码。不同的模型使用不同的方法进行位置编码。 + GPT-2 从头开始训练自己的位置编码矩阵,将其直接集成到训练过程中。

-

Step 4. Final Embedding

+

步骤4:最终嵌入

- Finally, we sum the token and positional encodings to get the final embedding - representation. This combined representation captures both the semantic meaning of the - tokens and their position in the input sequence. + 最后,我们将标记和位置编码相加以获得最终的嵌入表示。这种组合表示既捕获了标记的语义含义,也捕获了它们在输入序列中的位置。
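步骤 3 和步骤 4 可以用同样的方式示意(wpe 为假设的变量名;GPT-2 的学习式位置编码矩阵形状为 (1024, 768),1024 是其最大上下文长度;数值均为随机,仅作演示):

```python
import numpy as np

d_model, context_len, seq_len = 768, 1024, 6
wpe = np.random.randn(context_len, d_model)    # 学习得到的位置编码矩阵(此处随机,仅作演示)

token_emb = np.random.randn(seq_len, d_model)  # 步骤 2 得到的 token 嵌入(此处随机代替)
pos_emb = wpe[np.arange(seq_len)]              # 步骤 3:按位置 0..5 查表

final_emb = token_emb + pos_emb                # 步骤 4:逐元素相加得到最终嵌入
print(final_emb.shape)                         # (6, 768)
```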

-

Transformer Block

+

Transformer 块

- The core of the Transformer's processing lies in the Transformer block, which comprises - multi-head self-attention and a Multi-Layer Perceptron layer. Most models consist of - multiple such blocks that are stacked sequentially one after the other. The token - representations evolve through layers, from the first block to the 12th one, allowing the - model to build up an intricate understanding of each token. This layered approach leads to - higher-order representations of the input. + Transformer 处理的核心在于 Transformer 块,它由多头自注意力和多层感知器层组成。大多数模型由多个这样的块组成, + 这些块按顺序一个接一个地堆叠在一起。Token 表示通过层级演变,从第一个块到第 12 个块,使模型能够对每个 Token 建立复杂的理解。 + 这种分层方法可以实现输入的高阶表示。

-

Multi-Head Self-Attention

+

多头自注意力

- The self-attention mechanism enables the model to focus on relevant parts of the input - sequence, allowing it to capture complex relationships and dependencies within the data. - Let’s look at how this self-attention is computed step-by-step. + 自注意力机制使模型能够专注于输入序列的相关部分,从而能够捕获数据中的复杂关系和依赖关系。 + 让我们一步步看看这种自注意力是如何计算的。

-

Step 1: Query, Key, and Value Matrices

+

第一步:查询、键和值矩阵(Query, Key, and Value Matrices)

- Figure 2. Computing Query, Key, and Value matrices from - the original embedding. + 图 2:根据原始嵌入计算查询、键和值矩阵。

- Each token's embedding vector is transformed into three vectors: - Query (Q), - Key (K), and - Value (V). These vectors are derived by multiplying the - input embedding matrix with learned weight matrices for - Q, - K, and - V. Here's a web search analogy to help us build some - intuition behind these matrices: + 每个 token 的嵌入向量被转换成三个向量: + Query (Q)、 + Key (K)和 + Value (V)。这些向量是通过将输入嵌入矩阵分别与 + Q、 + K 和 + V 对应的可学习权重矩阵相乘而得出的。这里有一个网络搜索类比,可以帮助我们建立这些矩阵背后的一些直觉:

  • - Query (Q) is the search text you type in - the search engine bar. This is the token you want to - "find more information about". + Query (Q) 是您在搜索引擎栏中输入的搜索文本。 + 这是您想要“查找更多信息”的标记。
  • - Key (K) is the title of each web page in the - search result window. It represents the possible tokens the query can attend to. + Key (K) 是搜索结果窗口中每个网页的标题。 + 它表示查询可以关注的可能的标记。
  • - Value (V) is the actual content of web pages - shown. Once we matched the appropriate search term (Query) with the relevant results (Key), - we want to get the content (Value) of the most relevant pages. + Value (V)是网页显示的实际内容。 + 当我们将适当的搜索词(Query)与相关结果(Key)匹配后,我们希望获得最相关页面的内容(Value)。

- By using these QKV values, the model can calculate attention scores, which determine how - much focus each token should receive when generating predictions. + 通过使用这些 QKV 值,模型可以计算注意力分数,这决定了每个标记在生成预测时应该获得多少关注。
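下面用 NumPy 给出一个单头、随机权重的简化示意(维度与 GPT-2 small 的单个注意力头一致,即 768/12 = 64;变量名均为假设),说明 Q、K、V 是如何由嵌入与各自的权重矩阵相乘并加上偏置得到的:

```python
import numpy as np

seq_len, d_model, d_head = 6, 768, 64          # GPT-2 small:12 个头,每头 64 维
X = np.random.randn(seq_len, d_model)          # 输入嵌入(随机代替)

W_q, b_q = np.random.randn(d_model, d_head), np.zeros(d_head)   # 可学习参数(此处随机/零,仅作演示)
W_k, b_k = np.random.randn(d_model, d_head), np.zeros(d_head)
W_v, b_v = np.random.randn(d_model, d_head), np.zeros(d_head)

Q = X @ W_q + b_q      # 查询:想“搜索”什么
K = X @ W_k + b_k      # 键:可被匹配的“标题”
V = X @ W_v + b_v      # 值:真正要取回的“内容”
print(Q.shape, K.shape, V.shape)   # 均为 (6, 64)
```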

-

Step 2: Masked Self-Attention

+

第二步:掩码自注意力机制

- Masked self-attention allows the model to generate sequences by focusing on relevant - parts of the input while preventing access to future tokens. + 掩码自注意力机制(Masked Self-Attention)允许模型通过关注输入的相关部分来生成序列,同时阻止访问未来的标记。

- Figure 3. Using Query, Key, and Value matrices to - calculate masked self-attention. + 图 3:使用查询、键和值矩阵计算掩码自注意力。
  • - Attention Score: The dot product of - Query - and Key matrices determines the alignment of each query with - each key, producing a square matrix that reflects the relationship between all input tokens. + 注意力分数QueryKey + 矩阵的点积确定每个查询与每个键的对齐方式,从而产生一个反映所有输入标记之间关系的方阵。
  • - Masking: A mask is applied to the upper triangle of the attention - matrix to prevent the model from accessing future tokens, setting these values to - negative infinity. The model needs to learn how to predict the next token without - “peeking” into the future. + 掩码:对注意力矩阵的上三角应用掩码,以防止模型访问未来的标记,并将这些值设置为负无穷大。 + 模型需要学习如何在不“窥视”未来的情况下预测下一个标记。
  • - Softmax: After masking, the attention score is converted into - probability by the softmax operation which takes the exponent of each attention score. - Each row of the matrix sums up to one and indicates the relevance of every other token - to the left of it. + Softmax:经过掩码处理后,注意力得分通过 softmax 运算转换为概率,该运算取每个注意 + 力得分的指数。矩阵的每一行总和为 1,并表示其左侧每个其他标记的相关性。
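把上面三个步骤(注意力分数、掩码、softmax)串起来,可以得到如下简化的数值示意(单头、随机输入,缩放因子为 √d_head,省略了 dropout):

```python
import numpy as np

def masked_attention_weights(Q, K):
    d_head = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_head)                     # 注意力分数:缩放点积
    mask = np.triu(np.ones_like(scores, dtype=bool), k=1)  # 上三角为 True
    scores = np.where(mask, -np.inf, scores)               # 掩码:未来位置设为负无穷
    scores -= scores.max(axis=-1, keepdims=True)           # 数值稳定
    weights = np.exp(scores)
    return weights / weights.sum(axis=-1, keepdims=True)   # softmax:每行之和为 1

Q, K = np.random.randn(6, 64), np.random.randn(6, 64)
A = masked_attention_weights(Q, K)
print(np.allclose(A.sum(axis=1), 1.0))   # True:每行是一个概率分布
print(np.allclose(A[0, 1:], 0.0))        # True:第一个 token 只能关注它自己
```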
-

Step 3: Output

+

第三步:输出

- The model uses the masked self-attention scores and multiplies them with the - Value matrix to get the - final output - of the self-attention mechanism. GPT-2 has 12 self-attention heads, each capturing - different relationships between tokens. The outputs of these heads are concatenated and passed - through a linear projection. + 该模型使用掩码后的自注意力得分,并将其与 Value 矩阵相乘, + 以获得自注意力机制的 最终输出。GPT-2 有 12 个 + 自注意力头(head),每个头捕获 token 之间的不同关系。这些头的输出被拼接起来,并通过一个线性投影(linear projection)层。
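单个头的输出就是注意力权重与 V 的乘积;下面用随机矩阵示意 12 个头的输出如何拼接并经过输出投影(维度与 GPT-2 small 一致,权重随机,仅作演示):

```python
import numpy as np

seq_len, n_head, d_head, d_model = 6, 12, 64, 768
heads = []
for _ in range(n_head):
    A = np.random.rand(seq_len, seq_len)
    A /= A.sum(axis=-1, keepdims=True)        # 假设这是掩码 softmax 之后的注意力权重
    V = np.random.randn(seq_len, d_head)
    heads.append(A @ V)                       # 单头输出:(6, 64)

concat = np.concatenate(heads, axis=-1)       # 拼接 12 个头:(6, 768)
W_proj = np.random.randn(d_model, d_model)    # 输出线性投影(随机,仅作演示)
print((concat @ W_proj).shape)                # (6, 768)
```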

-

MLP: Multi-Layer Perceptron

+

多层感知器

- Figure 4. Using MLP layer to project the self-attention - representations into higher dimensions to enhance the model's representational capacity. + 图 4:使用 MLP 层将自注意力表征投影到更高维度,以增强模型的表征能力。

- After the multiple heads of self-attention capture the diverse relationships between the - input tokens, the concatenated outputs are passed through the Multilayer Perceptron - (MLP) layer to enhance the model's representational capacity. The MLP block consists of - two linear transformations with a GELU activation function in between. The first linear - transformation increases the dimensionality of the input four-fold from 768 - to 3072. The second linear transformation reduces the dimensionality back - to the original size of 768, ensuring that the subsequent layers receive - inputs of consistent dimensions. Unlike the self-attention mechanism, the MLP processes - tokens independently and simply map them from one representation to another. + 在多个自注意力头捕获输入 token 之间的不同关系后,拼接后的输出将通过多层感知器(MLP,Multi-Layer Perceptron)层, + 以增强模型的表示能力。MLP 块由两个线性变换组成,中间有一个 GELU 激活函数。 + 第一个线性变换将输入的维数从 768 扩大四倍至 3072。 + 第二个线性变换将维数降低回原始大小 768,确保后续层接收一致维度的输入。 + 与自注意力机制不同,MLP 独立处理 token 并简单地将它们从一种表示映射到另一种表示。
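一个简化的 MLP 块示意如下(GELU 采用常见的 tanh 近似;权重随机,仅作演示):

```python
import numpy as np

def gelu(x):  # GELU 的 tanh 近似
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))

d_model, d_hidden = 768, 3072                 # 3072 = 4 × 768
W1, b1 = np.random.randn(d_model, d_hidden), np.zeros(d_hidden)
W2, b2 = np.random.randn(d_hidden, d_model), np.zeros(d_model)

x = np.random.randn(6, d_model)               # 每个 token 独立经过同一个 MLP
h = gelu(x @ W1 + b1)                         # 升维:768 -> 3072
y = h @ W2 + b2                               # 降维:3072 -> 768
print(y.shape)                                # (6, 768)
```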

-

Output Probabilities

+

输出概率

- After the input has been processed through all Transformer blocks, the output is passed - through the final linear layer to prepare it for token prediction. This layer projects - the final representations into a 50,257 - dimensional space, where every token in the vocabulary has a corresponding value called - logit. Any token can be the next word, so this process allows us to simply - rank these tokens by their likelihood of being that next word. We then apply the softmax - function to convert the logits into a probability distribution that sums to one. This - will allow us to sample the next token based on its likelihood. + 在输入经过所有 Transformer 块处理后,输出将通过最后的线性层,为标记预测做好准备。 + 此层将最终表示投影到 50,257 维空间中,词汇表中的每个标记都有一个对应的值, + 称为 logit。任何标记都可以是下一个单词,因此此过程允许我们根据它们成为 + 下一个单词的可能性对这些标记进行简单排序。然后,我们应用 softmax 函数将 logit 转换为 + 总和为 1 的概率分布。这将使我们能够根据其可能性对下一个标记进行采样。
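下面示意最后的线性层与 softmax 如何把 768 维表示变成 50,257 个 token 上的概率分布(权重随机,仅作演示;GPT-2 的输出层实际与词嵌入矩阵共享权重):

```python
import numpy as np

d_model, vocab_size = 768, 50257
W_out = np.random.randn(d_model, vocab_size)   # 输出投影(随机,仅作演示)
h_last = np.random.randn(d_model)              # 最后一个 token 经过所有块后的表示(随机代替)

logits = h_last @ W_out                        # 每个词表 token 一个 logit,共 50,257 个
logits -= logits.max()                         # 数值稳定
probs = np.exp(logits) / np.exp(logits).sum()  # softmax:总和为 1
print(probs.shape, round(float(probs.sum()), 6))
```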

- Figure 5. Each token in the vocabulary is assigned a - probability based on the model's output logits. These probabilities determine the - likelihood of each token being the next word in the sequence. + 图 5:词汇表中的每个标记都根据模型的输出 logits + 分配一个概率,这些概率决定了每个标记成为序列中下一个单词的可能性。

- The final step is to generate the next token by sampling from this distribution The temperature - hyperparameter plays a critical role in this process. Mathematically speaking, it is a very - simple operation: model output logits are simply divided by the - temperature: + 最后一步是从该分布中采样来生成下一个标记。temperature 超参数在 + 此过程中起着关键作用。从数学上讲,这是一个非常简单的操作:模型输出 logits 只 + 需除以 temperature

  • - temperature = 1: Dividing logits by one has no effect on the softmax - outputs. + temperature = 1:将 logits 除以 1 对 softmax 输出没有影响。
  • - temperature < 1: Lower temperature makes the model more confident and - deterministic by sharpening the probability distribution, leading to more predictable - outputs. + temperature < 1:较低的温度通过锐化概率分布使模型更加自信和确定,从而产生更可预测的输出。
  • - temperature > 1: Higher temperature creates a softer probability - distribution, allowing for more randomness in the generated text – what some refer to - as model “creativity”. + temperature > 1:较高的温度会产生更柔和的概率分布,从而允许生成的文本具有更多的随机性,有些人称之为模型的“创造力”。

- Adjust the temperature and see how you can balance between deterministic and diverse - outputs! + 调节温度,看看如何在确定性和多样化输出之间取得平衡!
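温度的作用可以用几行代码直观感受(logits 为假设的数值,仅含 3 个候选 token):

```python
import numpy as np

def softmax(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
for T in (0.5, 1.0, 2.0):
    print(T, softmax(logits / T).round(3))     # T<1 分布更尖锐,T>1 分布更平缓

next_token = np.random.choice(len(logits), p=softmax(logits / 1.0))  # 按概率采样下一个 token
```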

-

Advanced Architectural Features

+

高级架构功能

- There are several advanced architectural features that enhance the performance of - Transformer models. While important for the model's overall performance, they are not as - important for understanding the core concepts of the architecture. Layer Normalization, - Dropout, and Residual Connections are crucial components in Transformer models, - particularly during the training phase. Layer Normalization stabilizes training and - helps the model converge faster. Dropout prevents overfitting by randomly deactivating - neurons. Residual Connections allows gradients to flow directly through the network and - helps to prevent the vanishing gradient problem. + 有几种高级架构功能可增强 Transformer 模型的性能。虽然它们对于模型的整体性能很重要, + 但对于理解架构的核心概念却不那么重要。层归一化、Dropout 和残差连接是 Transformer + 模型中的关键组件,尤其是在训练阶段。层归一化可以稳定训练并帮助模型更快地收敛。 + Dropout 通过随机停用神经元来防止过度拟合。残差连接允许梯度直接流过网络并有助于防止梯度消失问题。

-

Layer Normalization

+

层归一化

- Layer Normalization helps to stabilize the training process and improves convergence. - It works by normalizing the inputs across the features, ensuring that the mean and - variance of the activations are consistent. This normalization helps mitigate issues - related to internal covariate shift, allowing the model to learn more effectively and - reducing the sensitivity to the initial weights. Layer Normalization is applied twice - in each Transformer block, once before the self-attention mechanism and once before - the MLP layer. + 层归一化(Layer Normalization)有助于稳定训练过程并提高收敛性。它通过在特征维度上对输入进行归一化, + 确保激活的均值和方差一致。这种归一化有助于缓解与内部协变量偏移相关的问题, + 使模型能够更有效地学习并降低对初始权重的敏感度。每个 Transformer 块中都会 + 应用两次层归一化,一次在自注意力机制之前,一次在 MLP 层之前。
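层归一化可以用几行 NumPy 示意(gamma、beta 是可学习的缩放和平移参数;仅作演示):

```python
import numpy as np

def layer_norm(x, gamma, beta, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)      # 对每个 token 的 768 个特征求均值
    var = x.var(axis=-1, keepdims=True)        # 及方差
    return gamma * (x - mean) / np.sqrt(var + eps) + beta

x = np.random.randn(6, 768)
y = layer_norm(x, gamma=np.ones(768), beta=np.zeros(768))
print(y.mean(axis=-1).round(4), y.std(axis=-1).round(3))  # 每个 token 的均值≈0、标准差≈1
```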

-

Dropout

+

暂退法

- Dropout is a regularization technique used to prevent overfitting in neural networks - by randomly setting a fraction of model weights to zero during training. This - encourages the model to learn more robust features and reduces dependency on specific - neurons, helping the network generalize better to new, unseen data. During model - inference, dropout is deactivated. This essentially means that we are using an - ensemble of the trained subnetworks, which leads to a better model performance. + 暂退法(Dropout)是一种正则化技术,通过在训练期间随机将模型权重的一部分设置为零来防止神经网络过度拟合。 + 这鼓励模型学习更稳健的特征并减少对特定神经元的依赖,帮助网络更好地推广到新的、未见过的数据。 + 在模型推理期间,Dropout 被停用。这本质上意味着我们正在使用经过训练的子网络的集合,从而提高模型性能。
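下面是训练阶段随机丢弃(置零)部分神经元输出的一个简化示意(采用常见的 inverted dropout 写法,即按保留概率缩放;推理时直接关闭):

```python
import numpy as np

def dropout(x, p=0.1, training=True):
    if not training or p == 0.0:
        return x                               # 推理时不做任何处理
    mask = np.random.rand(*x.shape) >= p       # 以概率 p 随机丢弃
    return x * mask / (1.0 - p)                # 缩放以保持期望不变

x = np.random.randn(6, 768)
print(dropout(x, p=0.1, training=True).shape)  # (6, 768),其中约 10% 的元素被置零
```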

-

Residual Connections

- +

残差连接

- Residual connections were first introduced in the ResNet model in 2015. This - architectural innovation revolutionized deep learning by enabling the training of very - deep neural networks. Essentially, residual connections are shortcuts that bypass one - or more layers, adding the input of a layer to its output. This helps mitigate the - vanishing gradient problem, making it easier to train deep networks with multiple - Transformer blocks stacked on top of each other. In GPT-2, residual connections are - used twice within each Transformer block: once before the MLP and once after, ensuring - that gradients flow more easily, and earlier layers receive sufficient updates during - backpropagation. + 残差连接(Residual Connections)于 2015 年首次在 ResNet 模型中引入。这种架构创新通过实现非常深的神经网络的训练, + 彻底改变了深度学习。本质上,残差连接是绕过一个或多个层的捷径,将层的输入添加到其输出中。 + 这有助于缓解梯度消失问题,从而更容易训练堆叠在一起的多个 Transformer 块的深度网络。 + 在 GPT-2 中,每个 Transformer 块内使用两次残差连接:一次在 MLP 之前,一次在 MLP 之后, + 以确保梯度更容易流动,并且较早的层在反向传播期间获得足够的更新。
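残差连接本身只有一行:把子层的输入直接加回其输出(sublayer 为任意子层,例如注意力或 MLP;以下仅作示意):

```python
import numpy as np

def residual(x, sublayer):
    return x + sublayer(x)                     # 跳跃连接:输入直接加到子层输出上,便于梯度回传

x = np.random.randn(6, 768)
y = residual(x, lambda h: 0.1 * h)             # 用一个简单的函数代替注意力/MLP 子层
print(y.shape)                                 # (6, 768)
```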

-

Interactive Features

+

互动功能

- Transformer Explainer is built to be interactive and allows you to explore the inner - workings of the Transformer. Here are some of the interactive features you can play - with: + Transformer Explainer 是交互式的,可让您探索 Transformer 的内部工作原理。以下是您可以使用的一些交互式功能:

  • - Input your own text sequence to see how the model processes it and predicts - the next word. Explore attention weights, intermediate computations, and see how the final - output probabilities are calculated. + 输入您自己的文本序列,看看模型如何处理它并预测下一个单词。探索注意力权重、中间计算, + 并看看最终输出概率是如何计算的。
  • - Use the temperature slider to control the randomness of the model’s predictions. - Explore how you can make the model output more deterministic or more creative by changing - the temperature value. + 使用温度滑块控制模型预测的随机性。探索如何通过更改温度值使模型输出更具确定性或更具创造性。
  • - Interact with attention maps to see how the model focuses on different - tokens in the input sequence. Hover over tokens to highlight their attention weights and - explore how the model captures context and relationships between words. + 与注意力图交互,查看模型如何关注输入序列中的不同标记。将鼠标悬停在标记上 + 以突出显示其注意力权重,并探索模型如何捕获上下文和单词之间的关系。
-

Video Tutorial

+

视频教程

+