#!/usr/bin/env python3
"""
Script untuk membuat sample dataset JSONL untuk training
"""

import json
from pathlib import Path

def create_sample_dataset():
    """Create sample JSONL dataset"""
    
    # Sample training data: each record pairs free-form Indonesian text with
    # "category" and "language" metadata ("id" = Indonesian)
    sample_data = [
        {
            "text": "Apa itu machine learning? Machine learning adalah cabang dari artificial intelligence yang memungkinkan komputer belajar dari data tanpa diprogram secara eksplisit.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang deep learning. Deep learning adalah subset dari machine learning yang menggunakan neural network dengan banyak layer untuk memproses data kompleks.",
            "category": "education", 
            "language": "id"
        },
        {
            "text": "Bagaimana cara kerja neural network? Neural network bekerja dengan menerima input, memproses melalui hidden layers, dan menghasilkan output berdasarkan weights yang telah dilatih.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Apa keuntungan menggunakan Python untuk AI? Python memiliki library yang lengkap seperti TensorFlow, PyTorch, dan scikit-learn yang memudahkan development AI.",
            "category": "programming",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang transfer learning. Transfer learning adalah teknik menggunakan model yang sudah dilatih pada dataset besar dan mengadaptasinya untuk task yang lebih spesifik.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Bagaimana cara optimize model machine learning? Optimasi dapat dilakukan dengan hyperparameter tuning, feature engineering, dan menggunakan teknik seperti cross-validation.",
            "category": "optimization",
            "language": "id"
        },
        {
            "text": "Apa itu overfitting? Overfitting terjadi ketika model belajar terlalu detail dari training data sehingga performa pada data baru menurun.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang regularization. Regularization adalah teknik untuk mencegah overfitting dengan menambahkan penalty pada model complexity.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Bagaimana cara handle imbalanced dataset? Dataset tidak seimbang dapat diatasi dengan teknik sampling, class weights, atau menggunakan metrics yang tepat seperti F1-score.",
            "category": "data_handling",
            "language": "id"
        },
        {
            "text": "Apa itu ensemble learning? Ensemble learning menggabungkan multiple model untuk meningkatkan performa prediksi dan mengurangi variance.",
            "category": "education",
            "language": "id"
        }
    ]
    
    # Create data directory
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    
    # Write one JSON object per line (JSONL); ensure_ascii=False keeps UTF-8 text readable
    output_file = data_dir / "training_data.jsonl"
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in sample_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    
    print(f"βœ… Sample dataset created: {output_file}")
    print(f"πŸ“Š Total samples: {len(sample_data)}")
    print(f"πŸ“ File size: {output_file.stat().st_size / 1024:.2f} KB")
    
    # Show sample content
    print("\nπŸ“ Sample content:")
    print("-" * 50)
    for i, item in enumerate(sample_data[:3], 1):
        print(f"Sample {i}:")
        print(f"  Text: {item['text'][:100]}...")
        print(f"  Category: {item['category']}")
        print(f"  Language: {item['language']}")
        print()

def create_custom_dataset():
    """Create custom dataset from user input"""
    
    print("πŸ”§ Create Custom Dataset")
    print("=" * 40)
    
    # Get dataset info
    dataset_name = input("Dataset name (without extension): ").strip()
    if not dataset_name:
        dataset_name = "custom_dataset"
    
    num_samples = input("Number of samples (default 10): ").strip()
    try:
        num_samples = int(num_samples) if num_samples else 10
    except ValueError:
        num_samples = 10
    
    print(f"\nπŸ“ Creating {num_samples} samples...")
    print("Format: Enter text for each sample (empty line to finish early)")
    
    custom_data = []
    
    for i in range(num_samples):
        print(f"\nSample {i+1}/{num_samples}:")
        text = input("Text: ").strip()
        
        if not text:
            print("Empty text, finishing...")
            break
        
        category = input("Category (optional): ").strip() or "general"
        language = input("Language (optional, default 'id'): ").strip() or "id"
        
        sample = {
            "text": text,
            "category": category,
            "language": language
        }
        
        custom_data.append(sample)
        
        # Ask if user wants to continue
        if i < num_samples - 1:
            continue_input = input("Continue? (y/n, default y): ").strip().lower()
            if continue_input in ['n', 'no']:
                break
    
    if not custom_data:
        print("❌ No data entered, dataset not created")
        return
    
    # Create data directory
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    
    # Write one JSON object per line (JSONL); ensure_ascii=False keeps UTF-8 text readable
    output_file = data_dir / f"{dataset_name}.jsonl"
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in custom_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    
    print(f"\nβœ… Custom dataset created: {output_file}")
    print(f"πŸ“Š Total samples: {len(custom_data)}")

def main():
    print("πŸ“Š Dataset Creator for LLM Training")
    print("=" * 50)
    
    print("Pilih opsi:")
    print("1. Create sample dataset (10 samples)")
    print("2. Create custom dataset")
    print("3. View existing datasets")
    
    choice = input("\nChoice (1-3): ").strip()
    
    if choice == "1":
        create_sample_dataset()
    elif choice == "2":
        create_custom_dataset()
    elif choice == "3":
        data_dir = Path("data")
        if data_dir.exists():
            jsonl_files = list(data_dir.glob("*.jsonl"))
            if jsonl_files:
                print(f"\nπŸ“ Found {len(jsonl_files)} JSONL files:")
                for file in jsonl_files:
                    size = file.stat().st_size / 1024
                    print(f"  - {file.name} ({size:.2f} KB)")
            else:
                print("\nπŸ“ No JSONL files found in data/ directory")
        else:
            print("\nπŸ“ Data directory does not exist")
    else:
        print("❌ Pilihan tidak valid")

if __name__ == "__main__":
    main()
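
# If the Hugging Face `datasets` library is installed, the generated files can
# also be loaded directly for training (a sketch, assuming the package is
# available; `load_dataset("json", ...)` places the records in a "train" split):
#
#   from datasets import load_dataset
#   ds = load_dataset("json", data_files="data/training_data.jsonl")
#   print(ds["train"][0]["text"])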