| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| import argparse |
|
|
| import numpy as np |
|
|
def read_sentences(lines):
    """Group CoNLL-U lines into sentence blocks.

    Args:
        lines: Iterable of raw lines, each still carrying its line terminator
            (for example an open text file).

    Returns:
        A list of strings, one per sentence. Each string is the concatenation
        of the sentence's lines, including the empty line that terminates it.

    Raises:
        ValueError: If the last sentence is not followed by an empty line.
    """
    sentences = []
    current = []
    for line in lines:
        current.append(line)
        # A line that is empty once its terminator is removed closes the
        # sentence collected so far.
        if not line.rstrip("\r\n"):
            sentences.append("".join(current))
            current = []
    # Raise explicitly instead of using `assert`: assertions are removed under
    # `python -O`, which would silently drop the unterminated last sentence.
    if current:
        raise ValueError("Missing empty line after the last sentence")
    return sentences


def select_dev_indices(sentence_count, dev_size, seed=42):
    """Pick the indices of the sentences that go to the development set.

    Args:
        sentence_count: Total number of sentences available.
        dev_size: Fraction of the sentences to select; the number selected is
            ``int(sentence_count * dev_size)``.
        seed: Seed of the random generator, fixed so the split is reproducible.

    Returns:
        A set of distinct integer indices drawn from ``range(sentence_count)``.
    """
    dev_count = int(sentence_count * dev_size)
    # Nothing to sample (e.g. empty input): skip the NumPy call entirely so no
    # sample is requested from a possibly empty population.
    if dev_count == 0:
        return set()
    chosen = np.random.RandomState(seed).choice(sentence_count, dev_count, replace=False)
    # tolist() yields plain Python ints rather than NumPy scalars.
    return set(chosen.tolist())


def main():
    """Split a CoNLL-U file into training and development files."""
    parser = argparse.ArgumentParser()
    parser.add_argument("source", type=str, help="CoNLL-U file to split")
    parser.add_argument("train", type=str, help="CoNLL-U file to write training data to")
    parser.add_argument("dev", type=str, help="CoNLL-U file to write development data to")
    parser.add_argument("--dev_size", type=float, default=0.1, help="Size of the development data")
    args = parser.parse_args()

    with open(args.source, "r", encoding="utf-8") as source:
        sentences = read_sentences(source)

    dev_indices = select_dev_indices(len(sentences), args.dev_size)

    # Sentences keep their original relative order within each output file.
    with open(args.train, "w", encoding="utf-8") as train, \
            open(args.dev, "w", encoding="utf-8") as dev:
        for i, sentence in enumerate(sentences):
            (dev if i in dev_indices else train).write(sentence)


if __name__ == "__main__":
    main()
|
|