| import requests |
| from subprocess import call |
| import os |
| from pylatexenc.latex2text import LatexNodes2Text |
|
|
| def get_paper(paper_url): |
| if 'abs' in paper_url: |
| eprint_url = paper_url.replace("https://arxiv.org/abs/", "https://arxiv.org/e-print/") |
| elif 'pdf' in paper_url: |
| eprint_url = paper_url.replace("https://arxiv.org/pdf/", "https://arxiv.org/e-print/") |
| else: |
| raise ValueError("Invalid arXiv URL") |
|
|
| suffix = 'paper-dir/' + eprint_url.replace("https://arxiv.org/e-print/", "") |
|
|
| if not os.path.exists("paper-dir"): |
| call(["mkdir", 'paper-dir']) |
|
|
| |
| if os.path.exists(suffix): |
| print("Paper already downloaded, skipping download") |
| else: |
| print("Downloading paper") |
| r = requests.get(eprint_url) |
|
|
| with open("paper", "wb") as f: |
| f.write(r.content) |
|
|
| |
| call(["mkdir", suffix]) |
| call(["tar", "-xzf", "paper", "-C", suffix]) |
|
|
| |
| tex_files = [f for f in os.listdir(suffix) if f.endswith('.tex')] |
| |
| if 'math_commands.tex' in tex_files: |
| tex_files.remove('math_commands.tex') |
| if len(tex_files) == 1: |
| |
| with open(f'{suffix}/{tex_files[0]}', 'r') as f: |
| paper_tex = f.read() |
| elif len(tex_files) == 0: |
| raise ValueError("No .tex files found in the paper") |
| else: |
| raise ValueError("More than one .tex file found in the paper") |
| |
| |
| paper_text = LatexNodes2Text().latex_to_text(paper_tex) |
|
|
| with open(f"{suffix}/main.txt", 'w') as f: |
| f.write(paper_text) |
| |
| return paper_text |
|
|
| if __name__=="__main__": |
| paper_url = "https://arxiv.org/abs/2206.08896" |
| paper_text = get_paper(paper_url) |
| print(paper_text) |