bowphs
/

latin_experiments

Model card Files Files and versions

latin_experiments / data /conllu_split.py

bowphs's picture

Add files using upload-large-folder tool

c34decd verified 12 months ago

history blame contribute delete

1.69 kB

	#!/usr/bin/env python3
	#
	# This file is part of LatinPipe EvaLatin 24
	# <https://github.com/ufal/evalatin2024-latinpipe>.
	#
	# Copyright 2024 Institute of Formal and Applied Linguistics, Faculty of
	# Mathematics and Physics, Charles University in Prague, Czech Republic.
	#
	# This Source Code Form is subject to the terms of the Mozilla Public
	# License, v. 2.0. If a copy of the MPL was not distributed with this
	# file, You can obtain one at http://mozilla.org/MPL/2.0/.
	import argparse

	import numpy as np

	def main(argv=None):
	    """Split a CoNLL-U file into training and development files.

	    The source file is read as a sequence of sentences, each one ending with
	    an empty line. A fixed-seed random sample of ``int(len(sentences) *
	    dev_size)`` sentences is written to the development file; the remaining
	    sentences go to the training file. Both outputs keep the original
	    sentence order.

	    Args:
	        argv: Command-line arguments (without the program name). When None,
	            argparse falls back to ``sys.argv[1:]``.

	    Raises:
	        AssertionError: If the source does not end with an empty line, i.e.
	            the last sentence is not terminated.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("source", type=str, help="CoNLL-U file to split")
	    parser.add_argument("train", type=str, help="CoNLL-U file to write training data to")
	    parser.add_argument("dev", type=str, help="CoNLL-U file to write development data to")
	    parser.add_argument("--dev_size", type=float, default=0.1, help="Size of the development data")
	    args = parser.parse_args(argv)

	    # Collect every sentence as one string, including its terminating empty line.
	    sentences = []
	    pending = []
	    with open(args.source, "r", encoding="utf-8") as source:
	        for line in source:
	            pending.append(line)
	            if not line.rstrip("\r\n"):
	                sentences.append("".join(pending))
	                pending = []

	    # Raised explicitly instead of using `assert`, so the check is not stripped
	    # under `python -O` (which would silently drop the unterminated sentence).
	    # The exception type and message are unchanged.
	    if pending:
	        raise AssertionError("Missing empty line after the last sentence")

	    # Fixed seed keeps the split reproducible across runs; do not alter this
	    # call, otherwise previously generated splits can no longer be recreated.
	    dev_indices = set(
	        np.random.RandomState(42).choice(len(sentences), int(len(sentences) * args.dev_size), replace=False)
	    )

	    with open(args.train, "w", encoding="utf-8") as train, open(args.dev, "w", encoding="utf-8") as dev:
	        for i, sentence in enumerate(sentences):
	            (dev if i in dev_indices else train).write(sentence)


	if __name__ == "__main__":
	    main()