code-scrapper / app.py
abidkh's picture
Trying to add an option to scrape from a link.
457c165
import gradio as gr
import os
import zipfile
import shutil
from pathlib import Path
import git
import re
def is_macos_metadata(rel_path):
    """Tell whether a relative path is a macOS archive artefact.

    A path counts as metadata when it starts with ``__MACOSX``, contains
    ``/__MACOSX`` anywhere, or when its final component is ``.DS_Store`` or
    begins with ``._``.

    Args:
        rel_path: Path relative to the extraction root.

    Returns:
        True for macOS metadata entries, False otherwise.
    """
    # Folder-level markers first: no need to look at the file name at all.
    if rel_path.startswith("__MACOSX") or "/__MACOSX" in rel_path:
        return True

    leaf = os.path.basename(rel_path)
    return leaf.startswith("._") or leaf == ".DS_Store"
def is_git_url(input_str):
    """Return True when *input_str* looks like a clonable Git URL.

    Accepted forms start with ``http://``, ``https://`` or ``git@`` and end
    with ``.git``, optionally followed by trailing slashes. Anything that is
    not a string (for example ``None``) is reported as not a URL instead of
    raising ``TypeError``.

    Args:
        input_str: Candidate value, normally the text typed by the user.

    Returns:
        bool: True for a recognised Git URL, False otherwise.
    """
    if not isinstance(input_str, str):
        return False
    # fullmatch (rather than match + "$") so a trailing newline is not
    # silently accepted as part of a valid URL.
    return bool(re.fullmatch(r"(https?://|git@).*\.git/*", input_str))
def clone_git_repo(repo_url, extract_dir):
    """Clone *repo_url* into *extract_dir*.

    Args:
        repo_url: URL of the repository to clone.
        extract_dir: Destination directory for the working tree.

    Returns:
        tuple: ``(True, None)`` on success, or ``(False, message)`` where
        *message* is a human-readable description of the failure.
    """
    try:
        repo = git.Repo.clone_from(repo_url, extract_dir)
    except Exception as e:
        # Any clone failure is reported to the caller as a message rather
        # than raised, so the caller can return it as a plain string.
        return False, f"Failed to clone repository: {e}"

    # The Repo object is not handed back to the caller, so release whatever
    # it holds now instead of waiting for garbage collection.
    repo.close()
    return True, None
def _repo_folder_name(repo_url):
    """Derive the display folder name (with trailing slash) from a Git URL."""
    trimmed = repo_url.rstrip("/")
    # str.rstrip(".git") strips a *set of characters*, which mangles names
    # such as "digit.git" -> "d"; drop the literal suffix instead.
    if trimmed.endswith(".git"):
        trimmed = trimmed[: -len(".git")]
    return trimmed.split("/")[-1] + "/"


def _build_tree(root, skip_macos):
    """Walk *root* and return ``(tree_lines, file_list)``.

    ``tree_lines`` is the rendered directory tree; ``file_list`` holds
    ``(relative_path, full_path)`` pairs for every regular file listed.
    """
    tree_lines = []
    file_list = []

    def walk(path, prefix):
        # Filter first so "last entry" is decided among the entries that are
        # actually shown; otherwise a skipped trailing entry leaves the final
        # visible one with a "├──" connector.
        visible = []
        for entry in sorted(os.listdir(path)):
            rel_path = os.path.relpath(os.path.join(path, entry), root)
            if skip_macos and is_macos_metadata(rel_path):
                continue
            if entry == ".git":  # Skip .git directory for repos
                continue
            visible.append(entry)

        last_index = len(visible) - 1
        for idx, entry in enumerate(visible):
            full_path = os.path.join(path, entry)
            is_last = idx == last_index
            tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{entry}")
            if os.path.islink(full_path):
                # Listed but never followed: a link may point outside the
                # extraction directory or back at one of its own parents.
                continue
            if os.path.isdir(full_path):
                walk(full_path, prefix + ("    " if is_last else "│   "))
            else:
                file_list.append((os.path.relpath(full_path, root), full_path))

    walk(root, "")
    return tree_lines, file_list


def process_input(input_source, skip_macos):
    """Combine a ZIP archive or Git repository into one text document.

    The ``working_dir`` directory is wiped and recreated on every call. The
    source is extracted (ZIP) or cloned (Git URL) into
    ``working_dir/extracted``; the result lists the directory tree followed by
    the content of every file, and is also written to
    ``working_dir/combined_output.txt``.

    Args:
        input_source: Git repository URL or path to a ZIP file.
        skip_macos: When true, macOS metadata entries are left out.

    Returns:
        tuple: ``(text, output_path)`` on success, or ``(error_message, None)``
        when the input is missing or invalid, or the clone fails.
    """
    # NOTE(review): some Gradio versions are understood to pass an upload as
    # a temp-file object exposing its path via `.name` - TODO confirm.
    if not isinstance(input_source, (str, os.PathLike)):
        input_source = getattr(input_source, "name", None)
    if isinstance(input_source, str):
        input_source = input_source.strip()
    if not input_source or not isinstance(input_source, (str, os.PathLike)):
        return "Please provide a ZIP file or a Git repository URL.", None

    extract_dir = "working_dir/extracted"
    output_file = "working_dir/combined_output.txt"
    shutil.rmtree("working_dir", ignore_errors=True)
    os.makedirs(extract_dir, exist_ok=True)

    # Determine if input is a Git URL or a ZIP file
    if isinstance(input_source, str) and is_git_url(input_source):
        success, error = clone_git_repo(input_source, extract_dir)
        if not success:
            return error, None
        folder_name = _repo_folder_name(input_source)
    else:
        # isfile (not exists): a directory path would make shutil.copy raise.
        if not os.path.isfile(input_source):
            return "Invalid ZIP file path.", None
        zip_path = "working_dir/uploaded.zip"
        shutil.copy(input_source, zip_path)
        try:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(extract_dir)
        except zipfile.BadZipFile:
            return "Invalid ZIP file.", None
        folder_name = Path(input_source).stem + "/"

    tree_lines, file_list = _build_tree(extract_dir, skip_macos)

    combined_text = [f"Folder {folder_name}", *tree_lines, ""]
    # file_list is already filtered by the tree walk, so the numbering below
    # is contiguous and no second metadata check is needed.
    for i, (rel_path, full_path) in enumerate(file_list, 1):
        combined_text.append(f"{i}. {rel_path}")
        combined_text.append(f"-- content of the {rel_path} file --")
        try:
            with open(full_path, "r", encoding="utf-8") as source_file:
                combined_text.append(source_file.read())
        except (OSError, ValueError) as e:
            # Unreadable or non-UTF-8 files are noted in place rather than
            # aborting the whole run (UnicodeDecodeError is a ValueError).
            combined_text.append(f"[Could not read file: {e}]")
        combined_text.append("")

    final_text = "\n".join(combined_text)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(final_text)
    return final_text, output_file
# Gradio UI
# Builds the Blocks interface bound to `demo`: a textbox for a Git URL, a ZIP
# upload, a checkbox for skipping macOS metadata, a trigger button, and two
# outputs (the combined text and a downloadable file).
with gr.Blocks() as demo:
gr.Markdown(" ") # Required to avoid HF title error
with gr.Row():
input_source = gr.Textbox(label="Enter Git Repository URL or Upload ZIP File", placeholder="e.g., https://github.com/user/repo.git")
zip_input = gr.File(file_types=[".zip"], label="Upload ZIP File (optional)")
skip_toggle = gr.Checkbox(label="Skip macOS metadata files (e.g., .DS_Store, __MACOSX)", value=True)
with gr.Row():
process_btn = gr.Button("Generate Output")
status = gr.Textbox(label="Output Text", lines=30, interactive=False)
file_output = gr.File(label="Download Output File")
# Click handler: picks the uploaded ZIP when one is present, otherwise the
# textbox value, and forwards it to process_input with the toggle state.
# NOTE(review): zip_file is passed through unchanged; what type it has
# depends on the Gradio File component - TODO confirm it is a path string.
def handle_inputs(input_text, zip_file, skip_macos):
# Use ZIP file if provided, otherwise use text input
source = zip_file if zip_file else input_text
return process_input(source, skip_macos)
# The two values returned by handle_inputs fill `status` and `file_output`,
# in that order.
process_btn.click(fn=handle_inputs, inputs=[input_source, zip_input, skip_toggle], outputs=[status, file_output])
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
demo.launch()