Yuchan committed on
Commit
0094083
·
verified ·
1 Parent(s): 9b34589

Update Model.py

Browse files
Files changed (1) hide show
  1. Model.py +25 -51
Model.py CHANGED
@@ -43,7 +43,7 @@ def download_file(url, save_path):
43
  f.write(chunk)
44
  print(f"✅ {save_path} 저장됨")
45
 
46
- DATA_PATH = "converted.jsonl"
47
  TOKENIZER_PATH = "ko_unigram.model"
48
 
49
  if not os.path.exists(DATA_PATH):
@@ -77,63 +77,37 @@ def text_to_ids(text):
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
-
81
def jsonl_stream(file_path):
    """Yield (input, masked_target) training pairs from a ShareGPT-style JSONL file.

    Each line holds ``{"conversations": [...]}`` with alternating human/gpt
    turns.  For every (human, gpt) pair we build:

        input prefix : "<start> {prompt} <sep>"
        continuation : "{response} <end>"

    The pair is tokenized, truncated/padded to ``max_len``, and the loss
    target is masked so only the response positions contribute (everything
    else is ``pad_id``).

    NOTE(review): relies on module-level ``text_to_ids``, ``max_len``,
    ``pad_id``, ``end_id`` and ``tf`` defined elsewhere in this file.

    Args:
        file_path: path to the JSONL corpus.

    Yields:
        (full_input, masked_target) — two int32 tensors of shape (max_len,).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            conversations = data.get("conversations", [])
            # Walk the turns in (human, gpt) pairs; skip malformed orderings.
            for i in range(0, len(conversations) - 1, 2):
                human_msg = conversations[i]
                gpt_msg = conversations[i + 1]
                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
                    continue

                prompt = human_msg.get("value", "").strip()
                response = gpt_msg.get("value", "").strip()

                # BUG FIX: the previous code sliced the target out of a string
                # that already ended with "<end>" and then appended " <end>"
                # again, encoding the end marker twice.  It also split on the
                # first "<sep>" occurrence, which mis-split prompts that
                # themselves contained "<sep>".  Building the two halves
                # directly avoids both problems.
                input_ids = text_to_ids(f"<start> {prompt} <sep>")
                target_ids = text_to_ids(f"{response} <end>")

                available_len = max_len - len(input_ids)
                if available_len <= 0:
                    # Prompt alone overflows the window: keep its tail,
                    # nothing left to learn from this example.
                    input_ids = input_ids[-max_len:]
                    target_ids = []
                    target_mask = [0] * len(input_ids)
                else:
                    target_ids = target_ids[:available_len]
                    target_mask = [0] * len(input_ids) + [1] * len(target_ids)

                full_input = input_ids + target_ids
                pad_len = max_len - len(full_input)
                full_input += [pad_id] * pad_len
                target_mask += [0] * pad_len

                # Next-token targets: shift left by one and close with end_id.
                target_seq = (full_input[1:] + [end_id])[:max_len]
                masked_target = [
                    t if m == 1 else pad_id
                    for t, m in zip(target_seq, target_mask)
                ]
                yield (
                    tf.convert_to_tensor(full_input, dtype=tf.int32),
                    tf.convert_to_tensor(masked_target, dtype=tf.int32),
                )
127
 
128
# Wrap the JSONL generator as a tf.data pipeline of fixed-length int32 pairs.
_spec = tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(
    lambda: jsonl_stream(DATA_PATH),
    output_signature=(_spec, _spec),
)

# Reproducible shuffle, full batches only, prefetch to overlap input with compute.
dataset = dataset.shuffle(1000, seed=SEED)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
@@ -367,7 +341,7 @@ model.save_weights("Cobra.weights.h5")
367
  print("모델 가중치 저장 완료!")
368
 
369
  def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
370
- model_input = text_to_ids(f"<start> {prompt} <sep>")
371
  model_input = model_input[:max_len]
372
  generated = list(model_input)
373
  for step in range(max_gen):
@@ -396,4 +370,4 @@ def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperatu
396
  return ids_to_text(generated)
397
 
398
  print("\n\n===== 생성 결과 =====")
399
- print(generate_text_topp(model, "안녕", p=0.9))
 
43
  f.write(chunk)
44
  print(f"✅ {save_path} 저장됨")
45
 
46
+ DATA_PATH = "corpus.txt"
47
  TOKENIZER_PATH = "ko_unigram.model"
48
 
49
  if not os.path.exists(DATA_PATH):
 
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
def txt_stream(file_path):
    """Stream (input, next-token target) pairs from a plain-text corpus.

    One training example per non-empty line: tokenize, truncate so an
    <end> token always fits, right-pad to ``max_len``, and derive the
    target by shifting the input left one position (pad-filled).

    NOTE(review): relies on module-level ``text_to_ids``, ``max_len``,
    ``pad_id``, ``end_id`` and ``tf`` defined elsewhere in this file.

    Args:
        file_path: path to the text corpus (one example per line).

    Yields:
        (inputs, shifted) — two int32 tensors of shape (max_len,).
    """
    with open(file_path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:
                continue  # blank lines carry no training signal

            # Truncate to max_len - 1 so the <end> marker always fits.
            token_ids = text_to_ids(stripped)[: max_len - 1] + [end_id]
            padding = [pad_id] * (max_len - len(token_ids))
            inputs = token_ids + padding

            # Next-token objective: target[i] = inputs[i + 1].
            shifted = inputs[1:] + [pad_id]
            yield (
                tf.convert_to_tensor(inputs, dtype=tf.int32),
                tf.convert_to_tensor(shifted, dtype=tf.int32),
            )
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
# Build a tf.data pipeline over the plain-text corpus generator.
_sig = tf.TensorSpec(shape=(max_len,), dtype=tf.int32)
dataset = tf.data.Dataset.from_generator(
    lambda: txt_stream(DATA_PATH),
    output_signature=(_sig, _sig),
)

# Reproducible shuffle, full batches only, prefetch to hide input latency.
dataset = (
    dataset.shuffle(2000, seed=SEED)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
 
341
  print("모델 가중치 저장 완료!")
342
 
343
  def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
344
+ model_input = text_to_ids(f"<start> {prompt}")
345
  model_input = model_input[:max_len]
346
  generated = list(model_input)
347
  for step in range(max_gen):
 
370
  return ids_to_text(generated)
371
 
372
  print("\n\n===== 생성 결과 =====")
373
+ print(generate_text_topp(model, "", p=0.9))