Update app.py
app.py CHANGED
@@ -15,9 +15,9 @@ from sklearn.metrics.pairwise import cosine_similarity
 app = Flask(__name__)
 CORS(app)
 UPLOAD_FOLDER = os.path.join(os.getcwd(), 'uploads')
-PEGASUS_MODEL_DIR = 'fine_tuned_pegasus'
-BERT_MODEL_DIR = 'fine_tuned_bert'
-LEGALBERT_MODEL_DIR = 'fine_tuned_legalbert'
+PEGASUS_MODEL_DIR = '/app/fine_tuned_pegasus'
+BERT_MODEL_DIR = '/app/fine_tuned_bert'
+LEGALBERT_MODEL_DIR = '/app/fine_tuned_legalbert'
 MAX_FILE_SIZE = 100 * 1024 * 1024

 if not os.path.exists(UPLOAD_FOLDER):
@@ -37,13 +37,11 @@ def load_or_finetune_pegasus():
     tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
     model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

-    # Load and normalize datasets
     cnn_dm = load_dataset("cnn_dailymail", "3.0.0", split="train[:5000]").rename_column("article", "text").rename_column("highlights", "summary")
     xsum = load_dataset("xsum", split="train[:5000]", trust_remote_code=True).rename_column("document", "text")
     combined_dataset = concatenate_datasets([cnn_dm, xsum])

     def preprocess_function(examples):
-        # Directly use normalized 'text' and 'summary' fields
         inputs = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
         targets = tokenizer(examples["summary"], max_length=400, truncation=True, padding="max_length", return_tensors="pt")
         inputs["labels"] = targets["input_ids"]
@@ -54,7 +52,7 @@ def load_or_finetune_pegasus():
     eval_dataset = tokenized_dataset.select(range(8000, 10000))

     training_args = TrainingArguments(
-        output_dir="
+        output_dir="/app/pegasus_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=1,
         per_device_eval_batch_size=1,
@@ -117,7 +115,7 @@ def load_or_finetune_bert():
     eval_dataset = tokenized_dataset.select(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))

     training_args = TrainingArguments(
-        output_dir="
+        output_dir="/app/bert_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=8,
         per_device_eval_batch_size=8,
@@ -180,7 +178,7 @@ def load_or_finetune_legalbert():
     eval_dataset = tokenized_dataset.select(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))

     training_args = TrainingArguments(
-        output_dir="
+        output_dir="/app/legalbert_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=8,
         per_device_eval_batch_size=8,
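For context: this change reads naturally if the `load_or_finetune_*` functions follow the usual check-then-train pattern, loading from the fine-tuned directory when it exists and otherwise fine-tuning and saving into it. Below is a minimal sketch of that pattern for the Pegasus case, assuming the standard `Trainer` / `save_pretrained` round trip. Only the constants and `TrainingArguments` values are taken from this commit; the guard, the function signature, and the save step are assumptions for illustration.

    import os
    from transformers import (
        PegasusForConditionalGeneration,
        PegasusTokenizer,
        Trainer,
        TrainingArguments,
    )

    PEGASUS_MODEL_DIR = '/app/fine_tuned_pegasus'  # absolute path, as in this commit

    def load_or_finetune_pegasus(train_dataset=None, eval_dataset=None):
        # Reuse a previously saved fine-tuned checkpoint if it exists.
        if os.path.isdir(PEGASUS_MODEL_DIR):
            tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_MODEL_DIR)
            model = PegasusForConditionalGeneration.from_pretrained(PEGASUS_MODEL_DIR)
            return tokenizer, model

        # Otherwise start from the base checkpoint (dataset preparation elided;
        # the diff above shows the cnn_dailymail + xsum pipeline).
        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
        model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

        training_args = TrainingArguments(
            output_dir="/app/pegasus_finetune",  # must be writable in the container
            num_train_epochs=3,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
        trainer.train()

        # Persist so later restarts load instead of retraining.
        model.save_pretrained(PEGASUS_MODEL_DIR)
        tokenizer.save_pretrained(PEGASUS_MODEL_DIR)
        return tokenizer, model

Paths hardcoded under /app typically match the working directory of a Docker-based Hugging Face Space, so the model and output lookups no longer depend on the current working directory of the process that launches the app.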