is_tf_keras = strtobool(os.environ.get('TF_KERAS', '0'))
os.environ["KERAS_BACKEND"] = os.environ.get("KERAS_BACKEND", 'tensorflow')
backlib = os.environ["KERAS_BACKEND"]
-if backlib == 'torch':
+if backlib == 'tfkeras':
+    is_tf_keras = True
+elif backlib == 'torch':
    import torch
elif backlib == 'jax':
    import jax
if is_tf_keras:
    sys.modules['keras'] = tf.keras
+
import keras
import keras.backend as K
+do_recompute = strtobool(os.environ.get('RECOMPUTE', '0'))
+use_keras_2 = is_tf_keras or keras.__version__ < '3.0'

-
-
-if keras.__version__ < '3.0':
+if use_keras_2:

    from tensorflow.python.client import device_lib
    from tensorflow.python.util import nest, tf_inspect
    from tensorflow.python.eager import tape
    from tensorflow.python.ops.custom_gradient import _graph_mode_decorator
    import bert4keras3.ops as ops
    load_variable = tf.train.load_variable
-
+    norm = tf.norm
else:
    from keras import ops
-    if backlib == torch:
+
+    if backlib == 'torch':
+        from torch.utils.checkpoint import checkpoint
        def norm(tensor, ord='euclidean', axis=None, keepdims=None):
            if ord == 'euclidean':
                ord = None
            return torch.linalg.norm(tensor, ord, axis, keepdims)
+        def recompute_grad(call):
+            if not do_recompute:
+                return call
+
+            def inner(self, inputs, **kwargs):
+                """Define the function whose gradient is needed and redefine how
+                the gradient is computed (adapted from the built-in tf.recompute_grad).
+                """
+                flat_inputs = nest.flatten(inputs)
+                call_args = tf_inspect.getfullargspec(call).args
+                for key in ['mask', 'training']:
+                    if key not in call_args and key in kwargs:
+                        del kwargs[key]
+                def kernel_call():
+                    return call(self, inputs, **kwargs)
+                return checkpoint(kernel_call, inputs, **kwargs)
+
+            return inner
    elif backlib == 'jax':
+        import jax
+        def recompute_grad(call):
+            if not do_recompute:
+                return call
+            return jax.checkpoint(call)
        def norm(tensor, ord='euclidean', axis=None, keepdims=None):
            if ord == 'euclidean':
                ord = None
            return jax.numpy.linalg.norm(tensor, ord, axis, keepdims)

    else:
+        def recompute_grad(call):
+            if not do_recompute:
+                return call
+            return tf.recompute_grad(call)
        norm = tf.norm
ops.norm = norm
# Decide whether recomputation is enabled (trading time for memory)
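Note: with this change the backend and the recompute switch are chosen entirely through environment variables read at import time. A minimal configuration sketch follows; the import path bert4keras3.backend is an assumption, while TF_KERAS, KERAS_BACKEND and RECOMPUTE are the variables used in the code above:

import os

# Pick the backend ('tensorflow' is the default; 'torch', 'jax', or 'tfkeras' to force tf.keras)
# and turn on gradient recomputation; both must be set before the first import below.
os.environ['KERAS_BACKEND'] = 'torch'
os.environ['RECOMPUTE'] = '1'

from bert4keras3.backend import keras, ops, recompute_grad  # assumed module path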
@@ -171,7 +203,7 @@ def dtype(x):
        pass
K.dtype = dtype

-if keras.__version__ < '3.0':
+if use_keras_2:
    def where(cond, x, y):
        """Add automatic broadcasting to tf.where.
        """
@@ -207,9 +239,10 @@ def sequence_masking(
        bias: an extra bias term, or an additional mask;
        return_mask: whether to also return the aligned mask.
    """
+
    if not (mask is None and bias is None):
        if mask is None:
-            if K.dtype(bias) == 'bool':
+            if K.dtype(bias) == 'bool' or (backlib == 'torch' and K.dtype(bias) == torch.bool):
                mask = bias
                x = ops.where(mask, x, value)
        else:
@@ -226,6 +259,8 @@ def sequence_masking(

            if K.dtype(mask) != 'bool':
                mask = ops.cast(mask, 'bool')
+            elif backlib == 'torch' and K.dtype(bias) == torch.bool:
+                mask = ops.cast(mask, torch.bool)

            full_mask = align(mask, [0, axes[0]], K.ndim(x))
            for axis in axes[1:]:
@@ -234,7 +269,7 @@ def sequence_masking(
            mask = full_mask
        if bias is None:
            x = ops.where(mask, x, value)
-        elif K.dtype(bias) == 'bool':
+        elif K.dtype(bias) == 'bool' or (backlib == 'torch' and K.dtype(bias) == torch.bool):
            mask = mask & bias
            x = ops.where(mask, x, value)
        else:
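A brief note on the torch.bool comparisons added in the hunks above: under the torch backend a tensor's dtype is a torch.dtype object rather than the plain string 'bool', so checking K.dtype(...) against 'bool' alone can miss boolean masks. A small illustration in plain PyTorch, independent of this module:

import torch

mask = torch.ones(2, 3, dtype=torch.bool)
print(str(mask.dtype))           # 'torch.bool', not the bare string 'bool'
print(mask.dtype == torch.bool)  # True, which is what the extra comparison catches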
@@ -280,7 +315,7 @@ def attention_normalize(a, mask=None, axis=-1, method='softmax', bias=None):
        softmax_plus: from https://kexue.fm/archives/8823 .
    """
    a, mask = sequence_masking(a, mask, -np.inf, axis, bias, True)
-    if method == 'softmax':
+    if method == 'softmax':
        return ops.softmax(a, axis=axis)
    else:
        if mask is None:
@@ -433,80 +468,79 @@ def graph_mode_decorator(f, *args, **kwargs):
        else:
            return _graph_mode_decorator(f, args, kwargs)

-
-def recompute_grad(call):
-    """Recompute decorator (used to decorate the call method of a Keras layer).
-    For recomputation details, see: https://arxiv.org/abs/1604.06174
-    """
-    if not do_recompute:
-        return call
-
-    def inner(self, inputs, **kwargs):
-        """Define the function whose gradient is needed and redefine how
-        the gradient is computed (adapted from the built-in tf.recompute_grad).
+    def recompute_grad(call):
+        """Recompute decorator (used to decorate the call method of a Keras layer).
+        For recomputation details, see: https://arxiv.org/abs/1604.06174
        """
-        flat_inputs = nest.flatten(inputs)
-        call_args = tf_inspect.getfullargspec(call).args
-        for key in ['mask', 'training']:
-            if key not in call_args and key in kwargs:
-                del kwargs[key]
-
-        def kernel_call():
-            """Define the forward computation.
-            """
-            return call(self, inputs, **kwargs)
-
-        def call_and_grad(*inputs):
-            """Define the forward and backward computation.
+        if not do_recompute:
+            return call
+
+        def inner(self, inputs, **kwargs):
+            """Define the function whose gradient is needed and redefine how
+            the gradient is computed (adapted from the built-in tf.recompute_grad).
            """
-            if is_tf_keras:
-                with tape.stop_recording():
-                    outputs = kernel_call()
-                    outputs = tf.identity(outputs)
-            else:
-                outputs = kernel_call()
-
-            def grad_fn(doutputs, variables=None):
-                watches = list(inputs)
-                if variables is not None:
-                    watches += list(variables)
-                with tf.GradientTape() as t:
-                    t.watch(watches)
-                    with tf.control_dependencies([doutputs]):
+            flat_inputs = nest.flatten(inputs)
+            call_args = tf_inspect.getfullargspec(call).args
+            for key in ['mask', 'training']:
+                if key not in call_args and key in kwargs:
+                    del kwargs[key]
+
+            def kernel_call():
+                """Define the forward computation.
+                """
+                return call(self, inputs, **kwargs)
+
+            def call_and_grad(*inputs):
+                """Define the forward and backward computation.
+                """
+                if is_tf_keras:
+                    with tape.stop_recording():
                        outputs = kernel_call()
-                    grads = t.gradient(
-                        outputs, watches, output_gradients=[doutputs]
+                        outputs = tf.identity(outputs)
+                else:
+                    outputs = kernel_call()
+
+                def grad_fn(doutputs, variables=None):
+                    watches = list(inputs)
+                    if variables is not None:
+                        watches += list(variables)
+                    with tf.GradientTape() as t:
+                        t.watch(watches)
+                        with tf.control_dependencies([doutputs]):
+                            outputs = kernel_call()
+                        grads = t.gradient(
+                            outputs, watches, output_gradients=[doutputs]
+                        )
+                    del t
+                    return grads[:len(inputs)], grads[len(inputs):]
+
+                return outputs, grad_fn
+
+            if is_tf_keras:  # only usable with tf >= 2.0
+                outputs, grad_fn = call_and_grad(*flat_inputs)
+                flat_outputs = nest.flatten(outputs)
+
+                def actual_grad_fn(*doutputs):
+                    grads = grad_fn(*doutputs, variables=self.trainable_weights)
+                    return grads[0] + grads[1]
+
+                watches = flat_inputs + self.trainable_weights
+                watches = [tf.convert_to_tensor(x) for x in watches]
+                tape.record_operation(
+                    call.__name__, flat_outputs, watches, actual_grad_fn
                )
-                del t
-                return grads[:len(inputs)], grads[len(inputs):]
-
-            return outputs, grad_fn
-
-        if is_tf_keras:  # only usable with tf >= 2.0
-            outputs, grad_fn = call_and_grad(*flat_inputs)
-            flat_outputs = nest.flatten(outputs)
-
-            def actual_grad_fn(*doutputs):
-                grads = grad_fn(*doutputs, variables=self.trainable_weights)
-                return grads[0] + grads[1]
-
-            watches = flat_inputs + self.trainable_weights
-            watches = [tf.convert_to_tensor(x) for x in watches]
-            tape.record_operation(
-                call.__name__, flat_outputs, watches, actual_grad_fn
-            )
-            return outputs
-        else:  # works with keras + tf >= 1.14
-            return graph_mode_decorator(call_and_grad, *flat_inputs)
-
-    return inner
+                return outputs
+            else:  # works with keras + tf >= 1.14
+                return graph_mode_decorator(call_and_grad, *flat_inputs)
+
+        return inner


ops.reshape = reshape
ops.flatten = flatten


-if keras.__version__ < '3.0':
+if use_keras_2:

    # Add a symbolic decorator to older keras for compatibility with optimizers.py
    keras.backend.symbolic = getattr(keras.backend, 'symbolic', None) or symbolic
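For reference, a minimal usage sketch of the recompute path. The layer is hypothetical, the import path bert4keras3.backend is assumed, and RECOMPUTE=1 must be set before the backend module is imported, otherwise do_recompute is false and the decorator returns call unchanged:

import os
os.environ['RECOMPUTE'] = '1'  # must be set before the backend module is imported

from bert4keras3.backend import keras, recompute_grad

class RecomputedDense(keras.layers.Layer):
    # Hypothetical layer: activations from its forward pass are rebuilt
    # during backprop instead of being cached, trading compute for memory.
    def build(self, input_shape):
        self.kernel = self.add_weight(shape=(input_shape[-1], input_shape[-1]))

    @recompute_grad
    def call(self, inputs):
        return keras.activations.relu(inputs @ self.kernel)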