Update app.py
app.py CHANGED
@@ -22,13 +22,13 @@ TOKENIZER_PATH = "bpe.model"
 
 if not os.path.exists(MODEL_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem-prototype/resolve/main/encoder_simcse.weights.h5?download=true",
         MODEL_PATH
     )
 
 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem-prototype/resolve/main/bpe.model?download=true",
         TOKENIZER_PATH
     )
 
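The download_file helper called in this hunk is defined earlier in app.py and its body is not part of the diff. For orientation, a minimal sketch of what such a helper could look like, assuming a plain streaming HTTP download; the name and signature match the call sites above, but the body is an assumption:

import requests

def download_file(url: str, dest_path: str, chunk_size: int = 1 << 20) -> None:
    # Hypothetical implementation: stream the response to disk so large
    # weight files never have to fit in memory at once.
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)

The ?download=true query string on the new URLs asks the Hugging Face Hub to serve the file as a download attachment.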
@@ -50,45 +50,72 @@ def encode_sentence(sentence, max_len=MAX_LEN):
 def pad_sentence(tokens):
     return tokens + [pad_id]*(MAX_LEN - len(tokens))
 
-class EncoderBlock(layers.Layer):
-    def __init__(self, embed_dim=EMBED_DIM, …
-        super().__init__()
-        self.…
-        self.…
-
-        self.…
-        self.…
-        self.…
-
-
-        attn = …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+class EncoderBlock(tf.keras.layers.Layer):
+    def __init__(self, embed_dim=EMBED_DIM, ff_dim=1152, seq_len=MAX_LEN):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.seq_len = seq_len
+
+        self.fc1 = layers.Dense(ff_dim)
+        self.fc2 = layers.Dense(embed_dim)
+        self.fc3 = layers.Dense(ff_dim//2)
+        self.fc4 = layers.Dense(embed_dim)
+
+        self.attn = layers.Dense(1)
+        self.token_mixer = layers.Dense(seq_len)
+        self.token_gate = layers.Dense(seq_len, activation='sigmoid')
+
+        self.ln = layers.LayerNormalization(epsilon=1e-5)
+        self.ln1 = layers.LayerNormalization(epsilon=1e-5)
+        self.ln2 = layers.LayerNormalization(epsilon=1e-5)
+        self.ln3 = layers.LayerNormalization(epsilon=1e-5)
+        self.ln4 = layers.LayerNormalization(epsilon=1e-5)
+
+    def call(self, x, mask):
+        # mask: (B, L), 1.0 for real tokens, 0.0 for padding
+        # x: (B, L, D)
+        x_norm = self.ln(x)
+
+        h = self.fc1(x_norm)
+        g, v = tf.split(h, 2, axis=-1)
+        h = tf.nn.silu(g) * v
+        h = self.fc2(h)
+
+        h = x + self.ln1(h)
+
+        scores = self.attn(h)
+        scores = tf.where(tf.equal(mask[..., tf.newaxis], 0), -1e9, scores)
+        scores = tf.nn.softmax(scores, axis=1)
+        attn = h + self.ln2(h * scores)
+
+        v = tf.transpose(attn, [0, 2, 1])
+        v = self.token_mixer(v) * self.token_gate(v)
+        v = tf.transpose(v, [0, 2, 1])
+
+        x_norm = attn + self.ln3(v)
+        x = self.fc3(x_norm)
+        x = tf.nn.silu(x)
+        x = self.fc4(x)
+
+        return x_norm + self.ln4(x)
 
 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
         super().__init__(**kwargs)
         self.axis = axis
         self.epsilon = epsilon
-
     def call(self, inputs):
         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)
-
     def get_config(self):
         return {"axis": self.axis, "epsilon": self.epsilon, **super().get_config()}
 
 class SentenceEncoder(tf.keras.Model):
-    def __init__(self, vocab_size, embed_dim=384, latent_dim=384, max_len=128, pad_id=…
+    def __init__(self, vocab_size, embed_dim=384, latent_dim=384, max_len=128, pad_id=pad_id):
         super().__init__()
         self.pad_id = pad_id
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
-        self.blocks = [EncoderBlock(…
+        self.blocks = [EncoderBlock() for _ in range(2)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None) # tanh removed
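The rewritten EncoderBlock chains three sub-steps: a SwiGLU-style gated feed-forward (fc1's output split into gate and value halves), a masked single-logit attention over positions, and an MLP-Mixer-style token-mixing Dense applied across the sequence axis. A minimal smoke test of the block's shape behavior; it assumes the class above is in scope and that EMBED_DIM = 384 and MAX_LEN = 128, consistent with the SentenceEncoder defaults. Because token_mixer is a Dense over the sequence axis, inputs must be padded to exactly seq_len:

import tensorflow as tf

block = EncoderBlock()                        # embed_dim=384, ff_dim=1152, seq_len=128
x = tf.random.normal((2, 128, 384))           # (batch, seq_len, embed_dim)
mask = tf.concat([tf.ones((2, 100)), tf.zeros((2, 28))], axis=1)  # last 28 positions are padding
out = block(x, mask)
print(out.shape)                              # (2, 128, 384): the block preserves shape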
@@ -98,10 +125,9 @@ class SentenceEncoder(tf.keras.Model):
         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
         x_embed = self.embed(x) + self.pos_embed(positions)
         mask = tf.cast(tf.not_equal(x, self.pad_id), tf.float32)
-
         x = x_embed
         for block in self.blocks:
-            x = block(x)
+            x = block(x, mask)
         x = self.ln_f(x)
 
         scores = self.attn_pool(x)
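The only functional change in this hunk is threading the padding mask into each block, so padded positions are pushed to near-zero weight in the block's per-position softmax. A small standalone illustration of that masking arithmetic (pad_id = 0 is an assumption for the example):

import tensorflow as tf

pad_id = 0
ids = tf.constant([[5, 9, 2, 0, 0]])                   # 3 real tokens, 2 pads
mask = tf.cast(tf.not_equal(ids, pad_id), tf.float32)  # [[1., 1., 1., 0., 0.]]

scores = tf.zeros((1, 5, 1))                           # stand-in for self.attn(h)
scores = tf.where(tf.equal(mask[..., tf.newaxis], 0), -1e9, scores)
weights = tf.nn.softmax(scores, axis=1)
print(weights[..., 0])                                 # ~[[0.333, 0.333, 0.333, 0., 0.]]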
@@ -110,7 +136,8 @@
         pooled = tf.reduce_sum(x * scores, axis=1)
 
         latent = self.latent(pooled)
-        return self.l2norm(latent) # L2
+        return self.l2norm(latent) # return after L2 normalization
+
 # 3️⃣ Load model
 # ===============================
 encoder = SentenceEncoder(vocab_size=vocab_size)
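End-to-end, the encoder maps padded token ids to a unit-norm sentence vector. A hedged usage sketch: vocab_size=32000 is a placeholder, pad_id is assumed to be 0, and the l2norm attribute is a L2NormLayer created in lines that fall outside this diff's hunks; the real values come from the loaded BPE tokenizer and the full app.py:

import tensorflow as tf

encoder = SentenceEncoder(vocab_size=32000)    # placeholder vocab size
ids = tf.constant([[5, 9, 2] + [0] * 125])     # one sentence padded to MAX_LEN = 128
emb = encoder(ids)                             # (1, 384), already L2-normalized
print(tf.norm(emb, axis=1).numpy())            # ~[1.]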