Yuchan
committed
Update Model.py
Model.py CHANGED
@@ -139,52 +139,71 @@ class Lo(layers.Layer):
 
 class LoSoU(layers.Layer):
     """
-    Stabilized LoSoU layer
+    Stabilized LoSoU layer (with dynamic alpha)
+    - alpha is computed dynamically from the input: alpha = sigmoid(Linear(x))
     - uses an exponential moving average (EMA) instead of a cumulative sum (alpha: smoothing factor)
     - internal computation runs in float32 (improves stability under TPU bfloat16)
     - the EMA result is clipped and a small epsilon is applied
     - safe split handling (assumes an even last dimension; otherwise the last dimension must be padded)
     """
-    def __init__(self, d_model, alpha, clip_value=5.0, eps=1e-6):
+    def __init__(self, d_model, clip_value=5.0, eps=1e-6):
         super().__init__()
         # most operations run in float32
         self.d_model = d_model
-        self.alpha = float(alpha)
         self.clip_value = float(clip_value)
         self.eps = float(eps)
 
         # projection / gating layers in float32
         self.Q = layers.Dense(96, dtype='float32')
         self.K = layers.Dense(96, dtype='float32')
-        # V produces d_model so keep it float32 internally
         self.V = Lo(d_model)  # Lo already handles casting to model dtype; we'll cast back to float32
         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
         self.O = layers.Dense(d_model, dtype='float32')
         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
+        # layer that computes the dynamic alpha
+        # alpha must lie in [0, 1], so a sigmoid activation is used
+        # the d_model features of the input x are used to compute alpha
+        # e.g. (B, L, d_model) -> (B, L, 1) -> (B, L, 1) with sigmoid
+        # or (B, L, d_model) -> (B, L, d_model) -> global reduce -> (B, L, 1)
+        # simplest option: the same alpha at every position (based on the input mean)
+        # or a different alpha per position (computed at each position)
+        # here: computed per position, shape (B, L, 1)
+        self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
+
-    def _ema_over_time(self, score):
+    def _ema_over_time(self, score, alpha_dynamic):
         # score: (B, L, D) float32 in [0,1] roughly
+        # alpha_dynamic: (B, L, 1) float32 in [0,1]
 
         # transpose to (L, B, D) to scan over time steps
-        seq = tf.transpose(score, perm=[1, 0, 2])
+        seq = tf.transpose(score, perm=[1, 0, 2])  # (L, B, D)
+        alpha_seq = tf.transpose(alpha_dynamic, perm=[1, 0, 2])  # (L, B, 1)
 
-        def step(prev_ema, x_t):
-            new = self.alpha * x_t + (1.0 - self.alpha) * prev_ema
+        def step(prev_ema, inputs):
+            x_t, alpha_t = inputs
+            # prev_ema: (B, D), x_t: (B, D), alpha_t: (B, 1)
+            new = alpha_t * x_t + (1.0 - alpha_t) * prev_ema
             return new
 
         # initialize with the value of the first step
-        init = seq[0]
+        init = seq[0]  # (B, D)
+        first_alpha = alpha_seq[0]  # (B, 1)
 
-        ema_seq = tf.scan(fn=step, elems=seq[1:], initializer=init)
+        # scan's elems must be (L-1, B, D) and (L-1, B, 1)
+        remaining_seq = seq[1:]  # (L-1, B, D)
+        remaining_alpha = alpha_seq[1:]  # (L-1, B, 1)
+
+        # elems is a tuple of two tensors: (x_t, alpha_t)
+        elems = (remaining_seq, remaining_alpha)
+
+        ema_seq = tf.scan(fn=step, elems=elems, initializer=init)
+        # prepend the initial value
         ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)  # (L, B, D)
 
         # transpose back to (B, L, D)
         ema = tf.transpose(ema_seq, perm=[1, 0, 2])
         return ema
 
-
     def call(self, x):
         # x: (B, L, d_model) maybe bfloat16 or float32
         # cast to float32 for all internal computations
@@ -192,8 +211,8 @@ class LoSoU(layers.Layer):
         residual = x_f32
 
         # Q, K, V
-        q = self.Q(x_f32)  # (B, L, 96)
-        k = self.K(x_f32)  # (B, L, 96)
+        q = self.Q(x_f32)  # (B, L, 96)
+        k = self.K(x_f32)  # (B, L, 96)
         V = tf.cast(self.V(x), tf.float32)  # ensure V's output is float32
 
         # gating signals in (0,1)
@@ -203,8 +222,13 @@ class LoSoU(layers.Layer):
         # elementwise product -> bounded roughly [0,1]
         score = g_q * g_k
 
+        # compute the dynamic alpha: (B, L, d_model) -> (B, L, 1)
+        alpha_dynamic = self.alpha_linear(x_f32)  # (B, L, 1)
+        # if needed, alpha_dynamic can be post-processed (e.g. min/max bounds)
+        # ex: alpha_dynamic = tf.clip_by_value(alpha_dynamic, 0.01, 0.99)
+
         # EMA across time (stable alternative to cumsum)
-        score_ema = self._ema_over_time(score)
+        score_ema = self._ema_over_time(score, alpha_dynamic)
 
         # optionally normalize by (mean + eps) across last dim to reduce scale variations
         mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)  # (B, L, 1)
@@ -224,7 +248,6 @@ class LoSoU(layers.Layer):
         if d is not None and d % 2 == 1:
            out = tf.pad(out, [[0,0],[0,0],[0,1]])
 
-
         a, b = tf.split(out, 2, axis=-1)
         gated = tf.nn.silu(a) * b
         out = self.O(gated)
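For reference, the recurrence the new _ema_over_time implements is ema_t = alpha_t * x_t + (1 - alpha_t) * ema_{t-1}, with ema_0 = x_0. The sketch below is a minimal standalone check of that recurrence against a plain Python loop; it is not code from this commit. ema_over_time is an illustrative free-function version of the layer's method, and the shapes, seed-free random inputs, and tolerance are assumptions.

# Standalone sketch (illustrative, not part of Model.py): verify the
# dynamic-alpha EMA computed with tf.scan against an explicit loop.
import tensorflow as tf

def ema_over_time(score, alpha):
    # score: (B, L, D) float32; alpha: (B, L, 1) float32 in [0, 1]
    seq = tf.transpose(score, perm=[1, 0, 2])        # (L, B, D)
    alpha_seq = tf.transpose(alpha, perm=[1, 0, 2])  # (L, B, 1)

    def step(prev_ema, inputs):
        x_t, alpha_t = inputs  # (B, D), (B, 1)
        return alpha_t * x_t + (1.0 - alpha_t) * prev_ema

    init = seq[0]  # ema_0 = x_0
    ema_seq = tf.scan(fn=step, elems=(seq[1:], alpha_seq[1:]), initializer=init)
    ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)  # (L, B, D)
    return tf.transpose(ema_seq, perm=[1, 0, 2])     # (B, L, D)

B, L, D = 2, 5, 4
x = tf.random.uniform((B, L, D), dtype=tf.float32)
a = tf.random.uniform((B, L, 1), minval=0.01, maxval=0.99, dtype=tf.float32)

# reference: explicit loop over the time axis
ref = [x[:, 0]]
for t in range(1, L):
    ref.append(a[:, t] * x[:, t] + (1.0 - a[:, t]) * ref[-1])
ref = tf.stack(ref, axis=1)  # (B, L, D)

print(bool(tf.reduce_all(tf.abs(ema_over_time(x, a) - ref) < 1e-5)))  # expect True

Because each step is a convex combination of the current score and the previous state, the running statistic stays bounded by the range of score, which is why the code comments call the EMA a "stable alternative to cumsum"; tf.scan keeps the update sequential over L while remaining graph-compatible.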