Yuchan
committed on
Update Model.py
Browse files
Model.py
CHANGED
|
@@ -43,7 +43,7 @@ def download_file(url, save_path):
|
|
| 43 |
f.write(chunk)
|
| 44 |
print(f"✅ {save_path} 저장됨")
|
| 45 |
|
| 46 |
-
DATA_PATH = "
|
| 47 |
TOKENIZER_PATH = "ko_unigram.model"
|
| 48 |
|
| 49 |
if not os.path.exists(DATA_PATH):
|
|
@@ -77,63 +77,37 @@ def text_to_ids(text):
|
|
| 77 |
def ids_to_text(ids):
|
| 78 |
return sp.decode(ids)
|
| 79 |
|
| 80 |
-
|
| 81 |
-
def jsonl_stream(file_path):
|
| 82 |
with open(file_path, "r", encoding="utf-8") as f:
|
| 83 |
for line in f:
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
target_ids = text_to_ids(target_text + " <end>")
|
| 103 |
-
available_len = max_len - len(input_ids)
|
| 104 |
-
|
| 105 |
-
if available_len <= 0:
|
| 106 |
-
input_ids = input_ids[-max_len:]
|
| 107 |
-
target_ids = []
|
| 108 |
-
target_mask = [0] * len(input_ids)
|
| 109 |
-
else:
|
| 110 |
-
target_ids = target_ids[:available_len]
|
| 111 |
-
target_mask = [0] * len(input_ids) + [1] * len(target_ids)
|
| 112 |
-
|
| 113 |
-
full_input = input_ids + target_ids
|
| 114 |
-
pad_len = max_len - len(full_input)
|
| 115 |
-
full_input += [pad_id] * pad_len
|
| 116 |
-
target_mask += [0] * pad_len
|
| 117 |
-
target_seq = full_input[1:] + [end_id]
|
| 118 |
-
target_seq = target_seq[:max_len]
|
| 119 |
-
masked_target = [
|
| 120 |
-
t if m == 1 else pad_id
|
| 121 |
-
for t, m in zip(target_seq, target_mask)
|
| 122 |
-
]
|
| 123 |
-
yield (
|
| 124 |
-
tf.convert_to_tensor(full_input, dtype=tf.int32),
|
| 125 |
-
tf.convert_to_tensor(masked_target, dtype=tf.int32)
|
| 126 |
-
)
|
| 127 |
|
| 128 |
dataset = tf.data.Dataset.from_generator(
|
| 129 |
-
lambda:
|
| 130 |
output_signature=(
|
| 131 |
tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
|
| 132 |
tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
|
| 133 |
-
)
|
| 134 |
)
|
| 135 |
|
| 136 |
-
dataset = dataset.shuffle(
|
| 137 |
|
| 138 |
with strategy.scope():
|
| 139 |
dist_dataset = strategy.experimental_distribute_dataset(dataset)
|
|
@@ -367,7 +341,7 @@ model.save_weights("Cobra.weights.h5")
|
|
| 367 |
print("모델 가중치 저장 완료!")
|
| 368 |
|
| 369 |
def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
|
| 370 |
-
model_input = text_to_ids(f"<start> {prompt}
|
| 371 |
model_input = model_input[:max_len]
|
| 372 |
generated = list(model_input)
|
| 373 |
for step in range(max_gen):
|
|
@@ -396,4 +370,4 @@ def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperatu
|
|
| 396 |
return ids_to_text(generated)
|
| 397 |
|
| 398 |
print("\n\n===== 생성 결과 =====")
|
| 399 |
-
print(generate_text_topp(model, "
|
|
|
|
| 43 |
f.write(chunk)
|
| 44 |
print(f"✅ {save_path} 저장됨")
|
| 45 |
|
| 46 |
+
DATA_PATH = "corpus.txt"
|
| 47 |
TOKENIZER_PATH = "ko_unigram.model"
|
| 48 |
|
| 49 |
if not os.path.exists(DATA_PATH):
|
|
|
|
| 77 |
def ids_to_text(ids):
    """Decode a sequence of token ids back into a text string via the SentencePiece model."""
    return sp.decode(ids)
|
| 79 |
|
| 80 |
+
def txt_stream(file_path):
    """Yield fixed-length (input, target) training pairs from a plain-text corpus.

    Each non-empty line of *file_path* is tokenized, truncated so that an
    <end> token still fits inside ``max_len``, right-padded with ``pad_id``,
    and paired with its next-token-shifted copy as the target sequence.
    Yields ``(input_tensor, target_tensor)`` int32 tensors of shape (max_len,).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue

            # Truncate to max_len - 1 so the <end> token can be appended.
            token_ids = text_to_ids(stripped)[:max_len - 1]

            sequence = token_ids + [end_id]
            sequence += [pad_id] * (max_len - len(sequence))

            # Target is the input shifted left by one position (next-token
            # prediction); the final slot is filled with pad_id.
            shifted = sequence[1:] + [pad_id]

            yield (
                tf.convert_to_tensor(sequence, dtype=tf.int32),
                tf.convert_to_tensor(shifted, dtype=tf.int32),
            )
|
| 100 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
# Build the training pipeline from the line-stream generator: both elements
# of each example are fixed-length int32 vectors of shape (max_len,).
_seq_spec = tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(
    lambda: txt_stream(DATA_PATH),
    output_signature=(_seq_spec, _seq_spec),
)

# Shuffle with a fixed seed for reproducibility, drop the ragged final
# batch (required for even distribution across replicas), and prefetch.
dataset = (
    dataset
    .shuffle(2000, seed=SEED)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
|
|
|
|
| 341 |
print("모델 가중치 저장 완료!")
|
| 342 |
|
| 343 |
def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
|
| 344 |
+
model_input = text_to_ids(f"<start> {prompt}")
|
| 345 |
model_input = model_input[:max_len]
|
| 346 |
generated = list(model_input)
|
| 347 |
for step in range(max_gen):
|
|
|
|
| 370 |
return ids_to_text(generated)
|
| 371 |
|
| 372 |
print("\n\n===== 생성 결과 =====")
|
| 373 |
+
print(generate_text_topp(model, "", p=0.9))
|