"""Carve a held-back test set off the Nepali-English parallel corpus.

Removes the final NUM_TEST_LINES sentence pairs from the processed
training files and writes them to TEST_DIR as test.ne / test.en,
rewriting the training files in place without those lines.
"""

import os

# Paths to the processed parallel corpus (rewritten in place) and the
# directory that receives the held-back test files.
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
SOURCE_FILE = os.path.join(DATA_DIR, "nepali.ne")
TARGET_FILE = os.path.join(DATA_DIR, "nepali.en")
NUM_TEST_LINES = 500  # number of sentence pairs held back for evaluation


def split_off_test_set(source_file, target_file, test_dir,
                       num_test_lines, test_basename="test"):
    """Move the last *num_test_lines* pairs of a parallel corpus into a test set.

    Reads *source_file* and *target_file*, rewrites both in place without
    their trailing *num_test_lines* lines, and writes those held-back lines
    to *test_dir* (created if missing) as ``<test_basename><ext>`` where the
    extension is taken from each input file.

    Args:
        source_file: path to the source-language training file (e.g. ``.ne``).
        target_file: path to the target-language training file (e.g. ``.en``).
        test_dir: output directory for the held-back files.
        num_test_lines: how many trailing lines to hold back; must be
            positive and strictly smaller than the corpus size, otherwise
            the training files would be emptied.
        test_basename: stem of the test-set output files.

    Raises:
        ValueError: if the two files differ in line count, or if
            *num_test_lines* is out of range.
    """
    with open(source_file, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    with open(target_file, "r", encoding="utf-8") as f:
        target_lines = f.readlines()

    # A parallel corpus must stay aligned line-for-line.  Raise a real
    # exception rather than using `assert`, so the check survives `python -O`.
    if len(source_lines) != len(target_lines):
        raise ValueError("Source and target files have different lengths!")

    # Guard the slicing: lines[:-0] is [] and would silently wipe the
    # training data; holding back the whole corpus is equally wrong.
    if not 0 < num_test_lines < len(source_lines):
        raise ValueError(
            f"num_test_lines must be in (0, {len(source_lines)}), "
            f"got {num_test_lines}"
        )

    os.makedirs(test_dir, exist_ok=True)

    src_ext = os.path.splitext(source_file)[1]
    tgt_ext = os.path.splitext(target_file)[1]

    # Rewrite the training files without the held-back tail.
    with open(source_file, "w", encoding="utf-8") as f:
        f.writelines(source_lines[:-num_test_lines])
    with open(target_file, "w", encoding="utf-8") as f:
        f.writelines(target_lines[:-num_test_lines])

    # Write the held-back tail as the test set.
    with open(os.path.join(test_dir, test_basename + src_ext),
              "w", encoding="utf-8") as f:
        f.writelines(source_lines[-num_test_lines:])
    with open(os.path.join(test_dir, test_basename + tgt_ext),
              "w", encoding="utf-8") as f:
        f.writelines(target_lines[-num_test_lines:])


if __name__ == "__main__":
    # Entry-point guard keeps the split from firing on import.
    print("--- Creating a held-back test set for Nepali ---")
    split_off_test_set(SOURCE_FILE, TARGET_FILE, TEST_DIR, NUM_TEST_LINES)
    print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Nepali.")
    print(f"The original training files in '{DATA_DIR}' have been updated.")