| | |
| |
|
| | from datasets import load_dataset |
| | import os |
| |
|
| |
|
| | def fetch_and_save_parallel_data(lang_pair, dataset_name, output_name): |
| | """ |
| | Downloads a parallel dataset and saves it into two |
| | separate text files (one for each language). |
| | |
| | Args: |
| | lang_pair (str): Language pair, e.g., "en-ne" for English-Nepali. |
| | dataset_name (str): The name of the dataset on Hugging Face Hub. |
| | output_name (str): The name to use for the output files. |
| | """ |
| | source_lang, target_lang = lang_pair.split("-") |
| | output_dir = "data/processed" |
| | os.makedirs(output_dir, exist_ok=True) |
| | |
| | source_filepath = os.path.join(output_dir, f"{output_name}.{source_lang}") |
| | target_filepath = os.path.join(output_dir, f"{output_name}.{target_lang}") |
| |
|
| | print(f"--- Starting download for {lang_pair} from {dataset_name} ---") |
| | |
| | try: |
| | |
| | if dataset_name == "Programmer-RD-AI/sinhala-english-singlish-translation": |
| | dataset = load_dataset(dataset_name, split='train') |
| | else: |
| | dataset = load_dataset(dataset_name, lang_pair, split='train') |
| | print(f"Dataset loaded successfully. Total pairs: {len(dataset)}") |
| |
|
| | print(f"Processing and saving files...") |
| | with open(source_filepath, "w", encoding="utf-8") as f_source, \ |
| | open(target_filepath, "w", encoding="utf-8") as f_target: |
| | |
| | for example in dataset: |
| | if dataset_name == "Programmer-RD-AI/sinhala-english-singlish-translation": |
| | source_sentence = example['Sinhala'] |
| | target_sentence = example['English'] |
| | else: |
| | source_sentence = example['translation'][source_lang] |
| | target_sentence = example['translation'][target_lang] |
| |
|
| | if source_sentence and target_sentence: |
| | f_source.write(source_sentence.strip() + "\n") |
| | f_target.write(target_sentence.strip() + "\n") |
| |
|
| | print(f"Successfully saved data for {lang_pair}") |
| | except Exception as e: |
| | print(f"An error occurred for {lang_pair}: {e}") |
| |
|
| | if __name__ == "__main__": |
| | |
| | print("Fetching Nepali data...") |
| | fetch_and_save_parallel_data(lang_pair="en-ne", dataset_name="Helsinki-NLP/opus-100", output_name="nepali") |
| | |
| | |
| | print("\nFetching Sinhalese data...") |
| | fetch_and_save_parallel_data(lang_pair="si-en", dataset_name="Programmer-RD-AI/sinhala-english-singlish-translation", output_name="sinhala") |
| | |
| |
|
| | |
| | print("\nFetching Sinhalese idioms data...") |
| | output_dir = "data/processed" |
| | try: |
| | idioms_dataset = load_dataset("Venuraa/English-Sinhala-Idioms-Parallel-Translations", split='train') |
| | print(f"Idioms dataset loaded successfully. Total pairs: {len(idioms_dataset)}") |
| |
|
| | with open(os.path.join(output_dir, "sinhala.si"), "a", encoding="utf-8") as f_source, \ |
| | open(os.path.join(output_dir, "sinhala.en"), "a", encoding="utf-8") as f_target: |
| | for example in idioms_dataset: |
| | parts = example['text'].split('\n') |
| | if len(parts) == 2: |
| | f_target.write(parts[0] + "\n") |
| | f_source.write(parts[1] + "\n") |
| | print("Successfully appended idioms data.") |
| | except Exception as e: |
| | print(f"An error occurred while fetching idioms data: {e}") |
| | print("\nAll data fetching complete.") |
| |
|