import datetime
import logging
import os
import re
import tempfile
import zipfile
from io import BytesIO, IOBase

import docx
import gradio as gr
import huggingface_hub
import PyPDF2
import torch
from docx.shared import RGBColor, Pt
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
| |
| |
| |
# Configure logging once at import time: INFO and above, each record rendered
# as "timestamp [LEVEL] logger-name - message". The application then logs
# through its own named logger.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("LLM-Legal-App")
|
|
| |
| |
| |
def initialize_model():
    """Load the phi-2 causal language model and its tokenizer from Hugging Face.

    Returns:
        tuple: ``(model, tokenizer)`` for the ``microsoft/phi-2`` checkpoint,
        with the model loaded in float16 and placed via ``device_map="auto"``.

    Raises:
        ValueError: If the tokenizer or model cannot be loaded. The underlying
            exception is chained as ``__cause__`` so the root cause stays
            visible in tracebacks.
    """
    logger.info("Initializing phi-2 model and tokenizer...")
    model_name = "microsoft/phi-2"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): trust_remote_code=True lets code shipped in the
            # model repository run locally. Confirm it is still required for
            # this checkpoint with the installed transformers version.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        # Chain the original error instead of discarding it.
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized phi-2 model and tokenizer.")
    return model, tokenizer


# The model and tokenizer are loaded once, at import time, and shared by every
# request handled by this module.
model, tokenizer = initialize_model()
|
|
| |
| |
| |
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Text to feed to the tokenizer and model.
        max_length: Maximum number of *new* tokens to generate (forwarded to
            ``generate`` as ``max_new_tokens``).
        temperature: Sampling temperature. Values ``<= 0`` disable sampling
            (greedy decoding).

    Returns:
        str: The generated continuation with surrounding whitespace removed.
        On any failure the function does not raise; it returns a string that
        starts with ``"Error generating text: "``, which callers use to detect
        failure.
    """
    logger.info("Generating text with phi-2 model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[-1]

        sampling_enabled = temperature > 0
        generation_config = {
            "max_new_tokens": max_length,
            "do_sample": sampling_enabled,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Sampling knobs only apply when sampling is on; passing them for
        # greedy decoding just produces configuration warnings.
        if sampling_enabled:
            generation_config["temperature"] = temperature
            generation_config["top_p"] = 0.9

        # NOTE(review): prompt tokens plus max_new_tokens may exceed the
        # model's context window for long documents - verify and truncate
        # upstream if needed.
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # The returned sequence begins with the prompt tokens. Drop them by
        # position rather than by comparing decoded text, because decoding
        # does not always reproduce the prompt string exactly.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
|
|
def generate_legal_document(doc_type, party_a, party_b, context, country):
    """Build the drafting prompt and generate a legal document with the model.

    Args:
        doc_type: Kind of document to draft (e.g. "MOU", "NDA").
        party_a: Name of the first party.
        party_b: Name of the second party.
        context: Short brief describing the agreement.
        country: Country whose law governs the document.

    Any argument that is empty or whitespace-only is replaced by a bracketed
    placeholder so the prompt never contains "None" or a blank slot.

    Returns:
        str: Whatever ``generate_with_model`` returns - the generated text, or
        a string starting with "Error generating text: " on failure.
    """
    logger.info(f"Starting generation for doc_type={doc_type!r}.")

    doc_type = (doc_type or "").strip() or "legal agreement"
    party_a = (party_a or "").strip() or "[Party A Not Provided]"
    party_b = (party_b or "").strip() or "[Party B Not Provided]"
    context = (context or "").strip() or "[Context Not Provided]"
    country = (country or "").strip() or "[Governing Law Not Provided]"

    prompt = f"""
You are a helpful legal assistant. Generate a {doc_type} for:
1) {party_a}
2) {party_b}

Context/brief of the agreement:
{context}.

The document should include:
- Purpose of the {doc_type}
- Responsibilities and obligations of each party
- Confidentiality terms
- Payment terms (use [To Be Determined] if not specified)
- Term (duration) and termination
- Governing law: {country}
- Jurisdiction: [Appropriate region in {country} if not provided]
- Signature blocks

Use formal language, but keep it relatively clear and readable.
For any missing information, use placeholders like [To Be Determined].
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
"""
    logger.debug(f"Generated prompt:\n{prompt}")

    return generate_with_model(prompt, max_length=1400, temperature=0.3)
|
|
def _build_rule_based_prompt(doc_text):
    """Return the 18-section checklist review prompt for ``doc_text``."""
    return f"""
You are a legal AI assistant reviewing a document. Provide a review,
structured into the following numbered sections. Be concise and factual. Do NOT
use Markdown. Use plain text labels for each section.

Document text:
\"\"\"
{doc_text}
\"\"\"

Review Sections:

1) Parties and Authority:
- Confirm the full legal names of all parties.
- Make sure the people signing can legally commit their organizations.

2) Scope of Work / Obligations:
- Check that the contract clearly describes what each side must do.
- Look for deadlines, milestones, or deliverables.
- Ensure everything is realistic and not overly vague.

3) Definitions and Key Terms:
- See if there's a section that explains important terms.
- Ensure those terms are used the same way throughout the contract.
- Avoid or clarify any ambiguous language.

4) Payment Terms (If Applicable):
- Check how much is owed, the currency, and when it's due.
- Look for penalties, interest, or late fees.
- Note how and when invoices are sent or paid.

5) Term and Termination:
- Identify when the contract starts and ends.
- Understand how it can be renewed.
- See the conditions and notice required for ending the contract early.

6) Intellectual Property (IP) Rights:
- Confirm who owns any work created under the agreement.
- Note if licenses are granted for using the IP, and for how long.

7) Confidentiality and Privacy:
- Check what is considered confidential information.
- Look for exceptions (like already public info).
- See how long the confidentiality rules apply.

8) Warranties and Representations:
- Note any performance guarantees or quality promises.
- Look for disclaimers (like "as is" clauses).

9) Indemnification:
- See who will pay legal costs or damages if there's a lawsuit or claim.
- Check any limits on what's covered.

10) Limitation of Liability:
- Check if there's a maximum amount one side can claim in damages.
- Look for excluded damages, like lost profits.

11) Dispute Resolution and Governing Law:
- See if disputes go to arbitration, mediation, or court.
- Note which state or country's laws will apply.

12) Force Majeure (Unforeseen Events):
- Look for events like natural disasters or war that could suspend obligations.
- See if there are notice requirements for these events.

13) Notices and Amendments:
- Check how official notices must be sent (email, mail, etc.).
- Find out how to properly change the contract (in writing, signatures, etc.).

14) Entire Agreement and Severability:
- Confirm that this contract replaces all previous agreements.
- Ensure that if one clause is invalid, the rest still stands.

15) Signatures and Dates:
- Make sure the right people sign in their proper roles.
- Verify the date of signature and when the contract goes into effect.

16) Ambiguities, Contradictions, and Hidden Clauses:
- Watch for contradictory statements or clauses that conflict.
- Beware of vague phrases like "best efforts" without clear guidelines.
- Check for hidden or "buried" clauses in fine print or attachments.

17) Compliance and Regulatory Alignment:
- Ensure the contract follows relevant laws and rules.
- Check for industry-specific requirements.

18) Practical Considerations:
- Make sure deadlines and other requirements are doable.
- Confirm all negotiations are reflected in writing.
- Avoid blank or undefined items (like fees or dates "to be decided").
"""


def _build_wording_prompt(doc_text):
    """Return the wording-analysis prompt for ``doc_text``."""
    return f"""
You are a legal AI assistant. Analyze the following legal document for its wording:

Document text:
\"\"\"
{doc_text}
\"\"\"

Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:

1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.

Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
"""


def _run_review_stage(prompt, max_length, stage_name):
    """Run one review prompt; return ``(text, None)`` or ``(None, error_message)``."""
    try:
        result = generate_with_model(prompt, max_length=max_length, temperature=0.3)
    except Exception as e:
        logger.exception(f"Error during {stage_name}.")
        return None, f"Error during {stage_name}: {e}"

    # generate_with_model reports failures as a returned string rather than an
    # exception, so the failure prefix has to be checked explicitly.
    if result.startswith("Error generating text:"):
        logger.error(f"Error during {stage_name}.")
        return None, f"Error during {stage_name}: {result}"
    return result, None


def review_legal_document(doc_text, doc_type, party_a, party_b):
    """Review a document in two model passes: checklist review, then wording.

    Args:
        doc_text: Plain text of the document to review.
        doc_type: Accepted for interface compatibility; not used in the prompts.
        party_a: Accepted for interface compatibility; not used in the prompts.
        party_b: Accepted for interface compatibility; not used in the prompts.

    Returns:
        str: ``"Rule-Based Analysis:\\n\\n...\\n\\nWording Analysis:\\n\\n..."`` on
        success. If the document is empty or either pass fails, a message
        starting with "Error" is returned instead, so callers that test
        ``startswith("Error")`` see the failure.
    """
    logger.info("Starting document review (rule-based and wording).")

    if not doc_text or not doc_text.strip():
        logger.warning("No document text supplied for review.")
        return "Error: No document text to review."

    rule_based_prompt = _build_rule_based_prompt(doc_text)
    logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
    rule_based_review, error = _run_review_stage(
        rule_based_prompt, 2000, "rule-based review"
    )
    if error is not None:
        return error

    wording_analysis_prompt = _build_wording_prompt(doc_text)
    logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
    wording_analysis, error = _run_review_stage(
        wording_analysis_prompt, 1000, "wording analysis"
    )
    if error is not None:
        return error

    combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
    return combined_review
|
|
| |
| |
| |
|
|
def parse_bytesio(file_data: BytesIO) -> str:
    """Extract text from an in-memory DOCX or PDF stream.

    The stream is first opened as DOCX. If it is not a DOCX package, it is
    rewound and parsed as PDF.

    Args:
        file_data: Seekable binary stream holding the file content.

    Returns:
        str: The extracted text, stripped. On failure, a message starting with
        "Error" is returned instead of raising.
    """
    logger.info("Parsing BytesIO object...")

    try:
        doc_obj = docx.Document(file_data)
        return "\n".join(para.text for para in doc_obj.paragraphs).strip()
    except (docx.opc.exceptions.PackageNotFoundError, zipfile.BadZipFile):
        # A stream that is not a zip archive surfaces as BadZipFile rather
        # than PackageNotFoundError; both mean "not a DOCX", so fall through
        # to the PDF attempt.
        logger.info("BytesIO is not DOCX, trying PDF.")
    except Exception as e:
        logger.exception(f"Error processing BytesIO: {e}")
        return f"Error processing file content: {e}"

    try:
        file_data.seek(0)
        pdf_reader = PyPDF2.PdfReader(file_data)
        # Extract each page once; pages with no extractable text are skipped.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        return "\n".join(text for text in page_texts if text).strip()
    except Exception as e:
        logger.exception(f"Error parsing BytesIO as PDF: {e}")
        return f"Error parsing BytesIO as PDF: {e}"
|
|
def parse_uploaded_file_path(file_data) -> str:
    """Extract text from an uploaded PDF or DOCX.

    Args:
        file_data: One of
            - a filesystem path (``str``),
            - a dict carrying the path under the ``'name'`` key,
            - a binary stream (``BytesIO`` / ``IOBase``), which is delegated
              to ``parse_bytesio``.

    Returns:
        str: The extracted text, stripped. Returns ``""`` when nothing was
        provided, "Unsupported file format." for other extensions, and a
        message starting with "Error" when the input type is unexpected or
        parsing fails.
    """
    if not file_data:
        logger.warning("No file provided.")
        return ""

    if isinstance(file_data, str):
        file_path = file_data
        logger.info(f"Received filepath: {file_path}")
    elif isinstance(file_data, dict) and 'name' in file_data:
        file_path = file_data['name']
        logger.info(f"Received file object with name: {file_path}")
    elif isinstance(file_data, (BytesIO, IOBase)):
        return parse_bytesio(file_data)
    else:
        logger.error(f"Unexpected file_data type: {type(file_data)}")
        return "Error: Unexpected file data format."

    logger.info(f"Attempting to parse file at {file_path}")
    try:
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # Extract each page once; skip pages with no extractable text.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            return "\n".join(text for text in page_texts if text).strip()
        if ext == ".docx":
            doc_obj = docx.Document(file_path)
            return "\n".join(para.text for para in doc_obj.paragraphs).strip()
        return "Unsupported file format."
    except Exception as e:
        logger.exception(f"Error parsing file: {e}")
        return f"Error parsing file: {e}"
|
|
| |
| |
| |
|
|
def clean_markdown(text):
    """Remove common Markdown formatting and return plain text.

    Handles horizontal rules, ATX headings, bullet and numbered list markers,
    images, links, and bold/italic emphasis.

    Line-level constructs are removed before inline emphasis so a leading
    bullet ``*`` or a ``___`` rule is not mistaken for an emphasis delimiter.
    Emphasis is only stripped when the opening and closing delimiters match
    and sit on word boundaries, so identifiers such as ``party_a`` and
    arithmetic such as ``2 * 3 * 4`` are left intact.

    Args:
        text: Markdown text; ``None`` or empty yields ``""``.

    Returns:
        str: The cleaned text with leading/trailing whitespace removed.
    """
    if not text:
        return ""

    # Line-level constructs first.
    text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\-\+\*]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)

    # Images are dropped entirely; links keep only their label. Done before
    # emphasis so underscores inside URLs cannot be read as delimiters.
    text = re.sub(r'!\[(.*?)\]\((.*?)\)', '', text)
    text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)

    # Bold, then italic. The backreference forces the closing delimiter to
    # match the opening one; the lookarounds require non-space content that
    # touches the delimiters and (for italics) no adjacent word characters.
    text = re.sub(r'(\*\*|__)(?=\S)(.+?)(?<=\S)\1', r'\2', text)
    text = re.sub(r'(?<!\w)([*_])(?=\S)(.+?)(?<=\S)\1(?!\w)', r'\2', text)

    return text.strip()
|
|
def _add_red_heading(document, text):
    """Append a 14pt red pseudo-heading paragraph containing ``text``."""
    run = document.add_paragraph().add_run(text)
    run.font.size = Pt(14)
    run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)


def _add_review_sections(document, review_text):
    """Append the review text, styling the two known analysis sections.

    The review is split on blank lines. A chunk starting with
    "Rule-Based Analysis:" or "Wording Analysis:" opens that section, and every
    following chunk belongs to it until the next header. This matters because
    the header and its body are separated by a blank line, so they never
    arrive in the same chunk.
    """
    rule_header = "Rule-Based Analysis:"
    wording_header = "Wording Analysis:"
    current_section = None

    for chunk in review_text.split("\n\n"):
        for header in (rule_header, wording_header):
            if chunk.startswith(header):
                current_section = header
                _add_red_heading(document, header[:-1])
                chunk = chunk[len(header):]
                break

        if current_section is None:
            # Text before any recognised header is kept as-is.
            document.add_paragraph(chunk)
            continue

        for line in chunk.split("\n"):
            if not line.strip():
                continue  # avoid stray empty paragraphs
            if current_section == rule_header and re.match(r"^\d+\)", line):
                # Numbered checklist titles ("1) ...") are highlighted in red.
                numbered = document.add_paragraph(style='List Number')
                numbered.add_run(line).font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
            else:
                document.add_paragraph(line)


def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
    """Build a DOCX report and save it to a temporary file.

    Args:
        doc_text: Generated document text (Markdown is stripped); may be
            ``None``/empty to omit the "Generated Document" section.
        review_text: Review text as produced by ``review_legal_document``; may
            be ``None``/empty to omit the "LLM Review" section.
        doc_type: Document type used in the title and file name.
        party_a: First party name used in the title.
        party_b: Second party name used in the title.

    Empty ``doc_type`` / party values fall back to the parameter defaults so
    the title never contains blank slots.

    Returns:
        str: Path of the saved ``.docx`` file. The file is created with
        ``delete=False``, so removing it is left to the caller.
    """
    logger.debug("Creating and saving DOCX.")
    document = docx.Document()

    doc_type = doc_type or "Unknown"
    party_a = party_a or "Party A"
    party_b = party_b or "Party B"

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # doc_type ends up in the temp-file suffix; keep only filename-safe
    # characters so path separators or spaces cannot break file creation.
    safe_doc_type = re.sub(r"[^A-Za-z0-9._-]+", "_", str(doc_type))
    file_name = f"HF_AI_Review_{safe_doc_type}_{timestamp}.docx"

    title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
    document.add_heading(title, level=1)

    if doc_text:
        document.add_heading("Generated Document", level=2)
        for para in clean_markdown(doc_text).split("\n"):
            document.add_paragraph(para)

    if review_text:
        document.add_heading("LLM Review", level=2)
        _add_review_sections(document, review_text)

    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
        document.save(tmpfile.name)
        logger.debug(f"DOCX saved to: {tmpfile.name}")
        return tmpfile.name
|
|
| |
| |
| |
|
|
def generate_document_interface(doc_type, party_a, party_b, context, country):
    """Gradio callback: draft a document and package it as a DOCX download.

    Returns:
        tuple: ``(text, path)`` where ``text`` is the generated document (or
        the error message) and ``path`` is the saved DOCX file, or ``None``
        when the text starts with "Error".
    """
    logger.info(f"User requested doc generation: {doc_type}, {country}")
    generated = generate_legal_document(doc_type, party_a, party_b, context, country)

    saved_path = None
    if not generated.startswith("Error"):
        saved_path = create_and_save_docx(
            generated,
            doc_type=doc_type,
            party_a=party_a,
            party_b=party_b,
        )
    return generated, saved_path
|
|
def review_document_interface(file_data, doc_type, party_a, party_b):
    """Gradio callback: extract text from an upload, review it, export a DOCX.

    Args:
        file_data: Uploaded file as delivered by the file input.
        doc_type: Document type, forwarded to the review and the DOCX title.
        party_a: First party name, forwarded likewise.
        party_b: Second party name, forwarded likewise.

    Returns:
        tuple: ``(message_or_review, path)``. ``path`` is ``None`` whenever no
        file was given, parsing failed, no text could be extracted, or the
        review failed; the first element then carries the reason.
    """
    logger.info("User requested review.")
    if not file_data:
        return "No file uploaded.", None

    original_text = parse_uploaded_file_path(file_data)
    # The parser signals problems through these message prefixes.
    if original_text.startswith(("Error", "Unsupported")):
        return original_text, None
    # Image-only (scanned) PDFs parse "successfully" to empty text; reviewing
    # an empty document would only produce a meaningless report.
    if not original_text.strip():
        logger.warning("Uploaded file contained no extractable text.")
        return "No text could be extracted from the uploaded file.", None

    review_text = review_legal_document(original_text, doc_type, party_a, party_b)
    if review_text.startswith("Error"):
        return review_text, None

    docx_file_path = create_and_save_docx(None, review_text, doc_type, party_a, party_b)
    return review_text, docx_file_path
|
|
| |
| |
| |
| |
# Stylesheet handed to gr.Blocks(css=...). It defines one background/text
# colour pair per tab class; tabs opt in through their elem_classes.
custom_css = """
.tab-one {
background-color: #D1EEFC; /* Light blue */
color: #333;
}
.tab-two {
background-color: #FCEED1; /* Light orange */
color: #333;
}
/* If you want to style the tab label differently, you may need to target
specific child elements (like a .tab__header) within the class. */
"""
|
|
def build_app():
    """Assemble the Gradio Blocks UI and return it (without launching).

    Layout: an intro Markdown header, a tab set containing a hidden
    "Generate Document" tab and the selected "Review Document" tab, and a
    closing note about scanned PDFs. Button clicks are wired to
    ``generate_document_interface`` and ``review_document_interface``.
    """
    doc_type_choices = ["MOU", "MSA", "SoW", "NDA"]
    country_choices = ["India", "Malaysia", "US", "UK", "Singapore", "Japan"]

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown(
            """
# UST Global Legal Document Analyzer (Hugging Face Version)

**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.

**Disclaimer**: This tool provides assistance but is not a substitute for professional legal advice.
"""
        )
        with gr.Tabs(selected=1):
            # Generation tab: present in the layout but hidden from the user.
            with gr.Tab("Generate Document", visible=False):
                doc_type = gr.Dropdown(
                    label="Document Type",
                    choices=doc_type_choices,
                    value="MOU",
                )
                party_a = gr.Textbox(
                    label="Party A Name",
                    placeholder="e.g., Tech Innovations LLC",
                )
                party_b = gr.Textbox(
                    label="Party B Name",
                    placeholder="e.g., Global Consulting Corp",
                )
                context = gr.Textbox(
                    label="Context/Brief",
                    placeholder="Short summary of the agreement...",
                )
                country = gr.Dropdown(
                    label="Governing Law (Country)",
                    choices=country_choices,
                    value="India",
                )
                gen_button = gr.Button("Generate Document")
                gen_output_text = gr.Textbox(
                    label="Generated Document",
                    lines=15,
                    placeholder="Generated document will appear here...",
                )
                gen_output_file = gr.File(label="Download DOCX", type="filepath")
                gen_button.click(
                    generate_document_interface,
                    inputs=[doc_type, party_a, party_b, context, country],
                    outputs=[gen_output_text, gen_output_file],
                )

            # Review tab: the one selected on load (id=1).
            with gr.Tab("Review Document", elem_classes="tab-one", id=1):
                # Hidden fields that carry document type / party names into
                # the review callback.
                doc_type_review = gr.Dropdown(
                    label="Document Type",
                    choices=doc_type_choices,
                    value="MOU",
                    visible=False,
                )
                party_a_review = gr.Textbox(label="Party A Name", visible=False)
                party_b_review = gr.Textbox(label="Party B Name", visible=False)

                file_input = gr.File(label="Upload PDF/DOCX for Review", type="filepath")
                review_button = gr.Button("Review Document")
                review_output_text = gr.Textbox(
                    label="Review",
                    lines=15,
                    placeholder="Review will appear here...",
                )
                review_output_file = gr.File(label="Download Reviewed DOCX", type="filepath")
                review_button.click(
                    review_document_interface,
                    inputs=[file_input, doc_type_review, party_a_review, party_b_review],
                    outputs=[review_output_text, review_output_file],
                )

                # A second listener on the generate button copies the
                # generation inputs into the hidden review fields.
                gen_button.click(
                    lambda x, y, z: (x, y, z),
                    [doc_type, party_a, party_b],
                    [doc_type_review, party_a_review, party_b_review],
                )

        gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
    return demo
|
|
| |
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it locally
    # (no public share link) with Gradio's debug mode enabled.
    logger.info("Initializing Gradio interface...")
    demo = build_app()
    logger.info("Launching Gradio app.")
    demo.launch(
        debug=True,
        share=False,
    )