OpenLab-NLP
/

model-prototype

Model card Files Files and versions

xet

Community

Yuchan committed 30 days ago

Commit

0a74012

verified ·

1 Parent(s): 670fb78

Update Model.py

Browse files

Files changed (1) hide show

Model.py +133 -95

Model.py CHANGED Viewed

@@ -1,124 +1,162 @@
-import json
-import numpy as np
-import pandas as pd
-import tensorflow as tf
-from tensorflow.keras import layers
-import sentencepiece as spm
 import requests
-# ⬇️ File download helper
-def download_file(url, save_path):
-    # Stream the HTTP response body for `url` into `save_path` in 8192-byte chunks.
-    # raise_for_status() runs before the file is opened, so an HTTP error
-    # status raises without creating or truncating `save_path`.
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    with open(save_path, 'wb') as f:
-        for chunk in response.iter_content(chunk_size=8192):
-            f.write(chunk)
-    print(f"✅ 파일 저장됨: {save_path}")
-# ⬇️ Download the dataset and the tokenizer (runs unconditionally on every execution)
-download_file('https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/ko_unigram.model?download=true', 'ko_unigram.model')
-download_file('https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet?download=true', 'dataset.parquet')
-# ⬇️ Load the Parquet data
-df = pd.read_parquet("dataset.parquet", engine="pyarrow")
-# ⬇️ Convert to the "<start> question <sep> answer <end>" format
-train_sentences = []
-for conversations in df["conversations"]:
-    # Walk the turns two at a time; a trailing unpaired turn is never visited.
-    for i in range(0, len(conversations) - 1, 2):
-        item1, item2 = conversations[i], conversations[i + 1]
-        # Only (human, gpt) pairs in that order are kept.
-        if item1.get("from") == "human" and item2.get("from") == "gpt":
-            # Newlines inside a turn are flattened to single spaces.
-            prompt = item1.get("value", "").strip().replace("\n", " ")
-            response = item2.get("value", "").strip().replace("\n", " ")
-            full = f"<start> {prompt} <sep> {response} <end>"
-            train_sentences.append(full)
-# NOTE(review): self-assignment, has no effect.
-train_sentences = train_sentences
-print(f"총 문장 개수: {len(train_sentences)}")
-# ⬇️ Load the tokenizer
-sp = spm.SentencePieceProcessor()
-sp.load("ko_unigram.model")
-# ⬇️ Look up the special-token IDs
-# pad_id falls back to 0 when the "<pad>" lookup returns -1.
-pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
-start_id = sp.piece_to_id("<start>")
-sep_id = sp.piece_to_id("<sep>")
-end_id = sp.piece_to_id("<end>")
-unk_id = sp.piece_to_id("<unk>")
-vocab_size = sp.get_piece_size()
-print(f"✅ Vocabulary size: {vocab_size}")
-# ⬇️ Text <-> ID conversion helpers (both use the module-level `sp` processor)
-def text_to_ids(text):
-    # Encode `text` to integer piece ids.
-    return sp.encode(text, out_type=int)
-def ids_to_text(ids):
-    # Decode piece ids back to text.
-    return sp.decode(ids)
-# ⬇️ Preprocessing hyperparameters
-max_len = 230
-batch_size = 128
-# ⬇️ Preprocessing with input/target masking
-encoded_inputs = []
-targets = []
-for sentence in train_sentences:
-    if "<sep>" not in sentence:
-        continue
-    # Split at the FIRST "<sep>": the prompt side keeps the marker itself.
-    sep_index = sentence.index("<sep>")
-    input_text = sentence[:sep_index + len("<sep>")].strip()
-    target_text = sentence[sep_index + len("<sep>"):].strip()
-    input_ids = text_to_ids(input_text)
-    # NOTE(review): sentences are built as "... <sep> {response} <end>", so
-    # target_text already ends with "<end>" and this appends a second one — confirm intent.
-    target_ids = text_to_ids(target_text + " <end>")
-    full_input = input_ids + target_ids
-    full_input = full_input[:max_len]
-    # Mask is 0 over prompt ids, 1 over response ids (and 0 over padding below).
-    target_mask = [0] * len(input_ids) + [1] * len(target_ids)
-    target_mask = target_mask[:max_len]
-    if len(full_input) < max_len:
-        pad_len = max_len - len(full_input)
-        full_input += [pad_id] * pad_len
-        target_mask += [0] * pad_len
-    encoded_inputs.append(full_input)
-    # Labels are the inputs shifted left by one, with end_id appended at the end.
-    target_seq = full_input[1:] + [end_id]
-    target_seq = target_seq[:max_len]
-    # NOTE(review): target_mask is indexed by input position while target_seq is
-    # shifted by one, so the label at the last prompt position (the first
-    # response token) is replaced by pad_id — confirm this is intended.
-    masked_target = [
-        t if m == 1 else pad_id
-        for t, m in zip(target_seq, target_mask)
-    ]
-    targets.append(masked_target)
-# ⬇️ Convert to NumPy arrays
-encoded_inputs = np.array(encoded_inputs)
-targets = np.array(targets)
-# ⬇️ Build the TensorFlow Dataset
-def data_generator():
-    # Yield (input row, label row) pairs from the module-level arrays in order.
-    for input_seq, target_seq in zip(encoded_inputs, targets):
-        yield input_seq, target_seq
 dataset = tf.data.Dataset.from_generator(
-    data_generator,
     output_signature=(
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
-        tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
-    )
 )
-# Shuffle with a 1000-element buffer, then batch and prefetch.
-dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
-print("✅ TF Dataset 생성 완료!")
 class Lo(layers.Layer):
     def __init__(self, d_model):

+# Dependency: sentencepiece. Install it from a shell (`pip install sentencepiece`)
+# or a notebook cell (`%pip install sentencepiece`) before running this file;
+# a bare `pip install ...` line is a SyntaxError inside a Python module.
+import sentencepiece as spm
+import os, json, numpy as np, tensorflow as tf
+from tensorflow.keras import layers, Model
 import requests
+from tensorflow import keras
+from tensorflow.keras import layers
+import tensorflow.keras.backend as K
+# NOTE(review): presumably a leftover progress/debug marker — confirm it is still wanted.
+print('1')
+tf.get_logger().setLevel("ERROR")
+# Seed both TensorFlow and NumPy with the same value.
+SEED = 42
+tf.random.set_seed(SEED)
+np.random.seed(SEED)
+# TPU initialization
+try:
+    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
+    tf.tpu.experimental.initialize_tpu_system(resolver)
+    strategy = tf.distribute.TPUStrategy(resolver)
+    print("✅ TPU 초기화 완료:", resolver.cluster_spec().as_dict())
+    on_tpu = True
+except Exception as e:
+    # Any failure while setting up the TPU falls back to the default
+    # strategy; the error is printed, not re-raised.
+    print("⚠️ TPU 미사용, GPU/CPU로 진행:", e)
+    strategy = tf.distribute.get_strategy()
+    on_tpu = False
+# Mixed precision: bfloat16 compute only when a TPU was initialized, plain float32 otherwise.
+from tensorflow.keras import mixed_precision
+policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
+mixed_precision.set_global_policy(policy)
+print("✅ Mixed precision:", policy)
+# =======================
+# 1) File download
+# =======================
+def download_file(url, save_path, timeout=60):
+    """Stream *url* to *save_path*, replacing the target only on success.
+
+    The body is written to ``f"{save_path}.part"`` first and renamed into
+    place once the download has finished, so an interrupted transfer cannot
+    leave a truncated file that a later ``os.path.exists(save_path)`` check
+    would mistake for a complete download.
+
+    Args:
+        url: HTTP(S) URL to fetch.
+        save_path: Destination path on disk.
+        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
+            stalled server cannot block the script indefinitely.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    tmp_path = f"{save_path}.part"
+    # The context manager releases the connection even if writing fails.
+    with requests.get(url, stream=True, timeout=timeout) as r:
+        r.raise_for_status()
+        with open(tmp_path, "wb") as f:
+            for chunk in r.iter_content(8192 * 2):
+                f.write(chunk)
+    # Publish the finished file under its final name.
+    os.replace(tmp_path, save_path)
+    print(f"✅ {save_path} 저장됨")
+# Local cache paths; each file is downloaded only when it is not already on disk.
+DATA_PATH = "converted.jsonl"
+TOKENIZER_PATH = "ko_unigram.model"
+if not os.path.exists(DATA_PATH):
+    download_file(
+        "https://huggingface.co/datasets/Yuchan5386/SFT/resolve/main/data_shuffled_1.jsonl?download=true",
+        DATA_PATH
+    )
+if not os.path.exists(TOKENIZER_PATH):
+    download_file(
+        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
+        TOKENIZER_PATH
+    )
+# Load the SentencePiece model and look up the special-token ids.
+sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
+# pad_id falls back to 0 when the "<pad>" lookup returns -1.
+# NOTE(review): confirm piece_to_id really returns -1 (and not the <unk> id)
+# for a missing piece, otherwise this fallback never triggers.
+pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
+start_id = sp.piece_to_id("<start>")
+sep_id = sp.piece_to_id("<sep>")
+end_id = sp.piece_to_id("<end>")
+unk_id = sp.piece_to_id("<unk>")
+vocab_size = sp.get_piece_size()
+print(f"✅ Vocabulary size: {vocab_size}")
+# Fixed sequence length of every example and the global batch size.
+max_len = 200
+batch_size = 128
+def text_to_ids(text):
+    """Encode *text* into integer piece ids with the module-level ``sp`` processor."""
+    token_ids = sp.encode(text, out_type=int)
+    return token_ids
+def ids_to_text(ids):
+    """Decode piece *ids* back into text with the module-level ``sp`` processor."""
+    decoded = sp.decode(ids)
+    return decoded
+def _pack_example(input_ids, target_ids, max_len, pad_id):
+    """Pack prompt and response ids into fixed-length (inputs, labels) lists.
+
+    Args:
+        input_ids: Token ids of the prompt part (ends with the separator).
+        target_ids: Token ids of the response part (ends with the end token).
+        max_len: Length of both returned lists.
+        pad_id: Id used for padding and for every label that is not trained on.
+
+    Returns:
+        ``(full_input, labels)``, both of length ``max_len``. ``labels[i]`` is
+        the token that follows ``full_input[i]`` when that token belongs to
+        the response, and ``pad_id`` everywhere else.
+    """
+    available_len = max_len - len(input_ids)
+    if available_len <= 0:
+        # The prompt alone fills the window: keep its tail, drop the response.
+        input_ids = input_ids[-max_len:]
+        target_ids = []
+    else:
+        target_ids = target_ids[:available_len]
+    full_input = input_ids + target_ids
+    is_target = [0] * len(input_ids) + [1] * len(target_ids)
+    pad_len = max_len - len(full_input)
+    full_input = full_input + [pad_id] * pad_len
+    is_target = is_target + [0] * pad_len
+    # Position i predicts token i + 1, so the tokens AND the mask are shifted
+    # together. Shifting only the tokens hid the first response token and
+    # exposed the padding after the last one.
+    next_tokens = full_input[1:] + [pad_id]
+    next_is_target = is_target[1:] + [0]
+    labels = [
+        token if keep else pad_id
+        for token, keep in zip(next_tokens, next_is_target)
+    ]
+    return full_input, labels
+def jsonl_stream(file_path):
+    """Yield one (inputs, labels) int32 tensor pair per human/gpt turn pair.
+
+    Each non-blank line of *file_path* is parsed as a JSON object whose
+    ``"conversations"`` list is walked two turns at a time; pairs that are
+    not (human, gpt) in that order are skipped. The prompt is encoded as
+    ``"<start> {prompt} <sep>"`` and the response as ``"{response} <end>"``
+    (one end marker, not two), then both are packed to ``max_len``.
+
+    Args:
+        file_path: Path of a UTF-8 JSON Lines file.
+
+    Yields:
+        Tuples of two ``tf.int32`` tensors of shape ``(max_len,)``: the padded
+        input ids and the labels, with ``pad_id`` at every position that is
+        not a response token.
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                # Tolerate blank lines such as a trailing newline at EOF.
+                continue
+            data = json.loads(line)
+            conversations = data.get("conversations", [])
+            # Pair turn 0 with 1, 2 with 3, ...; an unpaired last turn is dropped.
+            for human_msg, gpt_msg in zip(conversations[::2], conversations[1::2]):
+                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
+                    continue
+                prompt = human_msg.get("value", "").strip()
+                response = gpt_msg.get("value", "").strip()
+                # Build both halves directly instead of joining and re-splitting
+                # on "<sep>", which mis-split prompts containing that literal.
+                input_ids = text_to_ids(f"<start> {prompt} <sep>")
+                target_ids = text_to_ids(f"{response} <end>".strip())
+                full_input, labels = _pack_example(
+                    input_ids, target_ids, max_len, pad_id
+                )
+                yield (
+                    tf.convert_to_tensor(full_input, dtype=tf.int32),
+                    tf.convert_to_tensor(labels, dtype=tf.int32),
+                )
 dataset = tf.data.Dataset.from_generator(
+    # The lambda lets from_generator start a fresh pass over the file each time it is called.
+    lambda: jsonl_stream(DATA_PATH),
     output_signature=(
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+    ),
 )
+# Seeded shuffle with a 1000-element buffer; drop_remainder=True discards a final partial batch.
+dataset = dataset.shuffle(1000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
+# Hand the batched dataset to the selected strategy (TPU or the default one).
+with strategy.scope():
+    dist_dataset = strategy.experimental_distribute_dataset(dataset)
 class Lo(layers.Layer):
     def __init__(self, d_model):