Yuchan committed on
Commit
0094083
·
verified ·
1 Parent(s): 9b34589

Update Model.py

Browse files
Files changed (1) hide show
  1. Model.py +25 -51
Model.py CHANGED
@@ -43,7 +43,7 @@ def download_file(url, save_path):
43
  f.write(chunk)
44
  print(f"✅ {save_path} 저장됨")
45
 
46
- DATA_PATH = "converted.jsonl"
47
  TOKENIZER_PATH = "ko_unigram.model"
48
 
49
  if not os.path.exists(DATA_PATH):
@@ -77,63 +77,37 @@ def text_to_ids(text):
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
-
81
def jsonl_stream(file_path):
    """Yield (input, masked_target) training pairs from a ShareGPT-style JSONL file.

    Each line holds ``{"conversations": [...]}`` with alternating human/gpt
    turns.  For every (human, gpt) pair we build:

        input prefix : "<start> {prompt} <sep>"
        continuation : "{response} <end>"

    The pair is tokenized, truncated/padded to ``max_len``, and the loss
    target is masked so only the response positions contribute (everything
    else is ``pad_id``).

    NOTE(review): relies on module-level ``text_to_ids``, ``max_len``,
    ``pad_id``, ``end_id`` and ``tf`` defined elsewhere in this file.

    Args:
        file_path: path to the JSONL corpus.

    Yields:
        (full_input, masked_target) — two int32 tensors of shape (max_len,).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            conversations = data.get("conversations", [])
            # Walk the turns in (human, gpt) pairs; skip malformed orderings.
            for i in range(0, len(conversations) - 1, 2):
                human_msg = conversations[i]
                gpt_msg = conversations[i + 1]
                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
                    continue

                prompt = human_msg.get("value", "").strip()
                response = gpt_msg.get("value", "").strip()

                # BUG FIX: the previous code sliced the target out of a string
                # that already ended with "<end>" and then appended " <end>"
                # again, encoding the end marker twice.  It also split on the
                # first "<sep>" occurrence, which mis-split prompts that
                # themselves contained "<sep>".  Building the two halves
                # directly avoids both problems.
                input_ids = text_to_ids(f"<start> {prompt} <sep>")
                target_ids = text_to_ids(f"{response} <end>")

                available_len = max_len - len(input_ids)
                if available_len <= 0:
                    # Prompt alone overflows the window: keep its tail,
                    # nothing left to learn from this example.
                    input_ids = input_ids[-max_len:]
                    target_ids = []
                    target_mask = [0] * len(input_ids)
                else:
                    target_ids = target_ids[:available_len]
                    target_mask = [0] * len(input_ids) + [1] * len(target_ids)

                full_input = input_ids + target_ids
                pad_len = max_len - len(full_input)
                full_input += [pad_id] * pad_len
                target_mask += [0] * pad_len

                # Next-token targets: shift left by one and close with end_id.
                target_seq = (full_input[1:] + [end_id])[:max_len]
                masked_target = [
                    t if m == 1 else pad_id
                    for t, m in zip(target_seq, target_mask)
                ]
                yield (
                    tf.convert_to_tensor(full_input, dtype=tf.int32),
                    tf.convert_to_tensor(masked_target, dtype=tf.int32),
                )
127
 
128
# Wrap the JSONL generator as a tf.data pipeline of fixed-length int32 pairs.
_spec = tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(
    lambda: jsonl_stream(DATA_PATH),
    output_signature=(_spec, _spec),
)

# Reproducible shuffle, full batches only, prefetch to overlap input with compute.
dataset = dataset.shuffle(1000, seed=SEED)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
@@ -367,7 +341,7 @@ model.save_weights("Cobra.weights.h5")
367
  print("모델 가중치 저장 완료!")
368
 
369
  def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
370
- model_input = text_to_ids(f"<start> {prompt} <sep>")
371
  model_input = model_input[:max_len]
372
  generated = list(model_input)
373
  for step in range(max_gen):
@@ -396,4 +370,4 @@ def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperatu
396
  return ids_to_text(generated)
397
 
398
  print("\n\n===== 생성 결과 =====")
399
- print(generate_text_topp(model, "안녕", p=0.9))
 
43
  f.write(chunk)
44
  print(f"✅ {save_path} 저장됨")
45
 
46
+ DATA_PATH = "corpus.txt"
47
  TOKENIZER_PATH = "ko_unigram.model"
48
 
49
  if not os.path.exists(DATA_PATH):
 
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
def txt_stream(file_path):
    """Stream (input, next-token target) pairs from a plain-text corpus.

    One training example per non-empty line: tokenize, truncate so an
    <end> token always fits, right-pad to ``max_len``, and derive the
    target by shifting the input left one position (pad-filled).

    NOTE(review): relies on module-level ``text_to_ids``, ``max_len``,
    ``pad_id``, ``end_id`` and ``tf`` defined elsewhere in this file.

    Args:
        file_path: path to the text corpus (one example per line).

    Yields:
        (inputs, shifted) — two int32 tensors of shape (max_len,).
    """
    with open(file_path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:
                continue  # blank lines carry no training signal

            # Truncate to max_len - 1 so the <end> marker always fits.
            token_ids = text_to_ids(stripped)[: max_len - 1] + [end_id]
            padding = [pad_id] * (max_len - len(token_ids))
            inputs = token_ids + padding

            # Next-token objective: target[i] = inputs[i + 1].
            shifted = inputs[1:] + [pad_id]
            yield (
                tf.convert_to_tensor(inputs, dtype=tf.int32),
                tf.convert_to_tensor(shifted, dtype=tf.int32),
            )
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
# Build a tf.data pipeline over the plain-text corpus generator.
_sig = tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(
    lambda: txt_stream(DATA_PATH),
    output_signature=(_sig, _sig),
)

# Reproducible shuffle, full batches only, prefetch to hide input latency.
dataset = (
    dataset.shuffle(2000, seed=SEED)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
 
341
  print("모델 가중치 저장 완료!")
342
 
343
  def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
344
+ model_input = text_to_ids(f"<start> {prompt}")
345
  model_input = model_input[:max_len]
346
  generated = list(model_input)
347
  for step in range(max_gen):
 
370
  return ids_to_text(generated)
371
 
372
  print("\n\n===== 생성 결과 =====")
373
+ print(generate_text_topp(model, "", p=0.9))