Yuchan committed on
Commit 0a74012 · verified · 1 Parent(s): 670fb78

Update Model.py

Files changed (1)
  1. Model.py +133 -95
Model.py CHANGED
@@ -1,124 +1,162 @@
- import json
- import numpy as np
- import pandas as pd
- import tensorflow as tf
- from tensorflow.keras import layers
- import sentencepiece as spm
  import requests

- # ⬇️ File download helper
- def download_file(url, save_path):
-     response = requests.get(url, stream=True)
-     response.raise_for_status()
-     with open(save_path, 'wb') as f:
-         for chunk in response.iter_content(chunk_size=8192):
-             f.write(chunk)
-     print(f"✅ File saved: {save_path}")
-
- # ⬇️ Download the dataset and tokenizer
- download_file('https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/ko_unigram.model?download=true', 'ko_unigram.model')
- download_file('https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet?download=true', 'dataset.parquet')
-
- # ⬇️ Load the Parquet data
- df = pd.read_parquet("dataset.parquet", engine="pyarrow")
-
- # ⬇️ Convert to "<start> question <sep> answer <end>" format
- train_sentences = []
-
- for conversations in df["conversations"]:
-     for i in range(0, len(conversations) - 1, 2):
-         item1, item2 = conversations[i], conversations[i + 1]
-         if item1.get("from") == "human" and item2.get("from") == "gpt":
-             prompt = item1.get("value", "").strip().replace("\n", " ")
-             response = item2.get("value", "").strip().replace("\n", " ")
-             full = f"<start> {prompt} <sep> {response} <end>"
-             train_sentences.append(full)
- train_sentences = train_sentences
- print(f"Total sentence count: {len(train_sentences)}")
-
- # ⬇️ Load the tokenizer
- sp = spm.SentencePieceProcessor()
- sp.load("ko_unigram.model")
-
- # ⬇️ Extract special token IDs
- pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
- start_id = sp.piece_to_id("<start>")
- sep_id = sp.piece_to_id("<sep>")
- end_id = sp.piece_to_id("<end>")
- unk_id = sp.piece_to_id("<unk>")

- vocab_size = sp.get_piece_size()
- print(f"✅ Vocabulary size: {vocab_size}")

- # ⬇️ Text <-> ID conversion helpers
- def text_to_ids(text):
-     return sp.encode(text, out_type=int)

- def ids_to_text(ids):
-     return sp.decode(ids)

- # ⬇️ Preprocessing hyperparameters
- max_len = 230
- batch_size = 128

- # ⬇️ Preprocessing with input and target masking
- encoded_inputs = []
- targets = []

- for sentence in train_sentences:
-     if "<sep>" not in sentence:
-         continue

-     sep_index = sentence.index("<sep>")
-     input_text = sentence[:sep_index + len("<sep>")].strip()
-     target_text = sentence[sep_index + len("<sep>"):].strip()

-     input_ids = text_to_ids(input_text)
-     target_ids = text_to_ids(target_text + " <end>")

-     full_input = input_ids + target_ids
-     full_input = full_input[:max_len]

-     target_mask = [0] * len(input_ids) + [1] * len(target_ids)
-     target_mask = target_mask[:max_len]

-     if len(full_input) < max_len:
-         pad_len = max_len - len(full_input)
-         full_input += [pad_id] * pad_len
-         target_mask += [0] * pad_len

-     encoded_inputs.append(full_input)

-     target_seq = full_input[1:] + [end_id]
-     target_seq = target_seq[:max_len]

-     masked_target = [
-         t if m == 1 else pad_id
-         for t, m in zip(target_seq, target_mask)
-     ]

-     targets.append(masked_target)

- # ⬇️ Convert to NumPy arrays
- encoded_inputs = np.array(encoded_inputs)
- targets = np.array(targets)

- # ⬇️ Build the TensorFlow Dataset
- def data_generator():
-     for input_seq, target_seq in zip(encoded_inputs, targets):
-         yield input_seq, target_seq

  dataset = tf.data.Dataset.from_generator(
-     data_generator,
      output_signature=(
          tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
-         tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
-     )
  )

- dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

- print("✅ TF Dataset created!")

  class Lo(layers.Layer):
      def __init__(self, d_model):
 
+ # pip install sentencepiece  (shell command; install the package before running this script)
+
+ import sentencepiece as spm
+
+ import os, json, numpy as np, tensorflow as tf
+
+ from tensorflow.keras import layers, Model
+
  import requests

+ from tensorflow import keras

+ from tensorflow.keras import layers

+ import tensorflow.keras.backend as K



+ print('1')



+ tf.get_logger().setLevel("ERROR")

+ SEED = 42

+ tf.random.set_seed(SEED)

+ np.random.seed(SEED)



+ # TPU initialization
+
+ try:
+     resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
+     tf.tpu.experimental.initialize_tpu_system(resolver)
+     strategy = tf.distribute.TPUStrategy(resolver)
+     print("✅ TPU initialized:", resolver.cluster_spec().as_dict())
+     on_tpu = True
+
+ except Exception as e:
+     print("⚠️ No TPU available, continuing on GPU/CPU:", e)
+     strategy = tf.distribute.get_strategy()
+     on_tpu = False
+
+ # Mixed precision
+ from tensorflow.keras import mixed_precision
+ policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
+ mixed_precision.set_global_policy(policy)
+ print("✅ Mixed precision:", policy)
+
+ # =======================
+ # 1) File downloads
+ # =======================
+
+ def download_file(url, save_path):
+     r = requests.get(url, stream=True)
+     r.raise_for_status()
+     with open(save_path, "wb") as f:
+         for chunk in r.iter_content(8192*2):
+             f.write(chunk)
+     print(f"✅ {save_path} saved")
+
+ DATA_PATH = "converted.jsonl"
+ TOKENIZER_PATH = "ko_unigram.model"
+
+ if not os.path.exists(DATA_PATH):
+     download_file(
+         "https://huggingface.co/datasets/Yuchan5386/SFT/resolve/main/data_shuffled_1.jsonl?download=true",
+         DATA_PATH
+     )
+
+ if not os.path.exists(TOKENIZER_PATH):
+     download_file(
+         "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
+         TOKENIZER_PATH
+     )
+
+ sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
+
+ pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
+ start_id = sp.piece_to_id("<start>")
+ sep_id = sp.piece_to_id("<sep>")
+ end_id = sp.piece_to_id("<end>")
+ unk_id = sp.piece_to_id("<unk>")
+ vocab_size = sp.get_piece_size()
+ print(f"✅ Vocabulary size: {vocab_size}")

+ max_len = 200
+ batch_size = 128
+
+ def text_to_ids(text):
+     return sp.encode(text, out_type=int)
+
+ def ids_to_text(ids):
+     return sp.decode(ids)
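
A quick round-trip check of these helpers (an illustrative sketch, not part of the committed file; it assumes the ko_unigram.model tokenizer has been downloaded as above):

    ids = text_to_ids("<start> 안녕하세요 <sep> 반갑습니다 <end>")
    print(ids)               # token IDs; the special pieces map to start_id/sep_id/end_id if the model defines them
    print(ids_to_text(ids))  # decodes back to roughly the original text, up to whitespace normalization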


+ def jsonl_stream(file_path):
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             data = json.loads(line)
+             conversations = data.get("conversations", [])
+             for i in range(0, len(conversations) - 1, 2):
+                 human_msg = conversations[i]
+                 gpt_msg = conversations[i + 1]
+                 if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
+                     continue
+
+                 prompt = human_msg.get("value", "").strip()
+                 response = gpt_msg.get("value", "").strip()
+                 full = f"<start> {prompt} <sep> {response} <end>"
+                 if "<sep>" not in full:
+                     continue
+
+                 sep_index = full.index("<sep>")
+                 input_text = full[:sep_index + len("<sep>")].strip()
+                 target_text = full[sep_index + len("<sep>"):].strip()
+                 input_ids = text_to_ids(input_text)
+                 target_ids = text_to_ids(target_text + " <end>")
+                 available_len = max_len - len(input_ids)
+
+                 if available_len <= 0:
+                     input_ids = input_ids[-max_len:]
+                     target_ids = []
+                     target_mask = [0] * len(input_ids)
+                 else:
+                     target_ids = target_ids[:available_len]
+                     target_mask = [0] * len(input_ids) + [1] * len(target_ids)
+
+                 full_input = input_ids + target_ids
+                 pad_len = max_len - len(full_input)
+                 full_input += [pad_id] * pad_len
+                 target_mask += [0] * pad_len
+                 target_seq = full_input[1:] + [end_id]
+                 target_seq = target_seq[:max_len]
+                 masked_target = [
+                     t if m == 1 else pad_id
+                     for t, m in zip(target_seq, target_mask)
+                 ]
+                 yield (
+                     tf.convert_to_tensor(full_input, dtype=tf.int32),
+                     tf.convert_to_tensor(masked_target, dtype=tf.int32)
+                 )
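
To see what one preprocessed example looks like, the generator can be probed directly (an illustrative sketch, not part of the committed file; it assumes converted.jsonl has been downloaded):

    x, y = next(jsonl_stream(DATA_PATH))
    print(x.shape, y.shape)   # both (max_len,), i.e. (200,)
    # y keeps real token IDs only at response positions; everything else is pad_id
    print(int(tf.reduce_sum(tf.cast(y != pad_id, tf.int32))))  # roughly the number of supervised positions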

  dataset = tf.data.Dataset.from_generator(
+     lambda: jsonl_stream(DATA_PATH),
      output_signature=(
          tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+     ),
  )

+ dataset = dataset.shuffle(1000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
+
+ with strategy.scope():
+     dist_dataset = strategy.experimental_distribute_dataset(dataset)
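
Pulling a single batch is a cheap way to confirm the pipeline's shapes before training (an illustrative sketch, not part of the committed file):

    for batch_x, batch_y in dataset.take(1):
        print(batch_x.shape, batch_y.shape)   # (batch_size, max_len) == (128, 200)

Only variable creation strictly needs to happen under strategy.scope(); distributing the dataset inside the scope, as the commit does, is harmless either way.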


  class Lo(layers.Layer):
      def __init__(self, d_model):