"""Carve a held-back test set off the Nepali-English parallel corpus.

Removes the final NUM_TEST_LINES sentence pairs from the processed
training files and writes them to TEST_DIR as test.ne / test.en,
rewriting the training files in place without those lines.
"""

import os

# Paths to the processed parallel corpus (rewritten in place) and the
# directory that receives the held-back test files.
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
SOURCE_FILE = os.path.join(DATA_DIR, "nepali.ne")
TARGET_FILE = os.path.join(DATA_DIR, "nepali.en")
NUM_TEST_LINES = 500  # number of sentence pairs held back for evaluation


def split_off_test_set(source_file, target_file, test_dir,
                       num_test_lines, test_basename="test"):
    """Move the last *num_test_lines* pairs of a parallel corpus into a test set.

    Reads *source_file* and *target_file*, rewrites both in place without
    their trailing *num_test_lines* lines, and writes those held-back lines
    to *test_dir* (created if missing) as ``<test_basename><ext>`` where the
    extension is taken from each input file.

    Args:
        source_file: path to the source-language training file (e.g. ``.ne``).
        target_file: path to the target-language training file (e.g. ``.en``).
        test_dir: output directory for the held-back files.
        num_test_lines: how many trailing lines to hold back; must be
            positive and strictly smaller than the corpus size, otherwise
            the training files would be emptied.
        test_basename: stem of the test-set output files.

    Raises:
        ValueError: if the two files differ in line count, or if
            *num_test_lines* is out of range.
    """
    with open(source_file, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    with open(target_file, "r", encoding="utf-8") as f:
        target_lines = f.readlines()

    # A parallel corpus must stay aligned line-for-line.  Raise a real
    # exception rather than using `assert`, so the check survives `python -O`.
    if len(source_lines) != len(target_lines):
        raise ValueError("Source and target files have different lengths!")

    # Guard the slicing: lines[:-0] is [] and would silently wipe the
    # training data; holding back the whole corpus is equally wrong.
    if not 0 < num_test_lines < len(source_lines):
        raise ValueError(
            f"num_test_lines must be in (0, {len(source_lines)}), "
            f"got {num_test_lines}"
        )

    os.makedirs(test_dir, exist_ok=True)

    src_ext = os.path.splitext(source_file)[1]
    tgt_ext = os.path.splitext(target_file)[1]

    # Rewrite the training files without the held-back tail.
    with open(source_file, "w", encoding="utf-8") as f:
        f.writelines(source_lines[:-num_test_lines])
    with open(target_file, "w", encoding="utf-8") as f:
        f.writelines(target_lines[:-num_test_lines])

    # Write the held-back tail as the test set.
    with open(os.path.join(test_dir, test_basename + src_ext),
              "w", encoding="utf-8") as f:
        f.writelines(source_lines[-num_test_lines:])
    with open(os.path.join(test_dir, test_basename + tgt_ext),
              "w", encoding="utf-8") as f:
        f.writelines(target_lines[-num_test_lines:])


if __name__ == "__main__":
    # Entry-point guard keeps the split from firing on import.
    print("--- Creating a held-back test set for Nepali ---")
    split_off_test_set(SOURCE_FILE, TARGET_FILE, TEST_DIR, NUM_TEST_LINES)
    print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Nepali.")
    print(f"The original training files in '{DATA_DIR}' have been updated.")