File size: 1,500 Bytes
18ad9a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from dotenv import load_dotenv
import os

# Populate the process environment before the lookups below; load_dotenv comes
# from the `dotenv` package (presumably reads a local .env file — confirm).
load_dotenv()

# Qdrant connection settings taken from the environment. os.getenv returns
# None when a variable is unset, so either value may be None.
# NOTE(review): neither constant is referenced in the visible part of this file.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
import os
import re

# Paths are anchored at the current working directory at import time, so the
# folders below resolve relative to wherever the script is launched from.
BASE_PATH = os.getcwd()

# Folder scanned for .txt files, and folder that receives the cleaned copies.
INPUT_FOLDER = os.path.join(BASE_PATH, "data", "processed")
OUTPUT_FOLDER = os.path.join(BASE_PATH, "data", "clean")

# Import-time side effect: create the output folder (and missing parents).
# exist_ok=True makes this a no-op when the folder already exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def clean_text(text):
    """Normalize raw extracted text into tidy, newline-separated lines.

    Steps, in order:

    1. Drop code points that cannot be encoded as UTF-8 (lone surrogates).
    2. Delete every character that is not a word character, whitespace, or
       one of ``. , ? ! - – — /``.
    3. Per line, collapse each whitespace run to a single space and trim
       both ends.
    4. Discard lines that end up empty (blank lines and lines that held
       only symbols), so consecutive line breaks shrink to one.

    Line boundaries are those recognized by ``str.splitlines`` and are
    always written back as a single newline character.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text with no leading or trailing whitespace; an empty
        string when nothing survives cleaning.
    """
    # Round-trip through UTF-8 so unencodable code points are silently dropped.
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    # Strip symbols BEFORE touching whitespace: removing a symbol that sat
    # between two spaces would otherwise leave a double space behind.
    text = re.sub(r"[^\w\s.,?!\-–—/]+", "", text)

    # Normalize whitespace line by line so line breaks survive; collapsing
    # r"\s+" over the whole text would swallow every newline.
    kept_lines = []
    for line in text.splitlines():
        line = re.sub(r"\s+", " ", line).strip()
        if line:
            kept_lines.append(line)

    return "\n".join(kept_lines)

def clean_all_files():
    """Clean every ``.txt`` file in INPUT_FOLDER and write it to OUTPUT_FOLDER.

    Each matching file is read as UTF-8 (undecodable bytes are ignored),
    passed through ``clean_text``, and saved under the same file name in the
    output folder. Progress is reported on stdout.
    """
    print("Cleaning text files...")
    print("Input:", INPUT_FOLDER)
    print("Output:", OUTPUT_FOLDER)

    for name in os.listdir(INPUT_FOLDER):
        # Entries without a .txt suffix are left alone.
        if not name.endswith(".txt"):
            continue

        source_path = os.path.join(INPUT_FOLDER, name)
        target_path = os.path.join(OUTPUT_FOLDER, name)

        with open(source_path, "r", encoding="utf-8", errors="ignore") as src:
            raw_text = src.read()

        # Clean before opening the destination so a failure here never
        # leaves a truncated output file behind.
        cleaned_text = clean_text(raw_text)

        with open(target_path, "w", encoding="utf-8") as dst:
            dst.write(cleaned_text)

        print("Cleaned:", name)

    print("\n✨ Done! Text cleaned successfully.")

# Run the cleaning pass only when executed as a script, not when imported.
if __name__ == "__main__":
    clean_all_files()
    # Final marker, printed after clean_all_files() has emitted its own summary.
    print("done")