seanpedrickcase committed
Commit b6265c3 · 0 Parent(s)

Sync: Added functionality to save to S3 and save logs to DynamoDB when using cli_topics

Files changed (49)
  1. .dockerignore +27 -0
  2. .gitattributes +1 -0
  3. .github/workflows/ci.yml +196 -0
  4. .github/workflows/simple-test.yml +46 -0
  5. .github/workflows/sync_to_hf.yml +53 -0
  6. .gitignore +22 -0
  7. Dockerfile +166 -0
  8. README.md +176 -0
  9. app.py +0 -0
  10. cli_topics.py +1943 -0
  11. entrypoint.sh +18 -0
  12. example_data/case_note_headers_specific.csv +7 -0
  13. example_data/combined_case_notes.csv +19 -0
  14. example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx +3 -0
  15. example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx +3 -0
  16. example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx +3 -0
  17. example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx +3 -0
  18. example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx +3 -0
  19. example_data/dummy_consultation_response.csv +31 -0
  20. example_data/dummy_consultation_response_themes.csv +26 -0
  21. intros/intro.txt +7 -0
  22. lambda_entrypoint.py +466 -0
  23. load_dynamo_logs.py +102 -0
  24. load_s3_logs.py +93 -0
  25. pyproject.toml +147 -0
  26. requirements.txt +29 -0
  27. requirements_cpu.txt +24 -0
  28. requirements_gpu.txt +28 -0
  29. requirements_lightweight.txt +18 -0
  30. test/README.md +87 -0
  31. test/__init__.py +5 -0
  32. test/mock_inference_server.py +225 -0
  33. test/mock_llm_calls.py +185 -0
  34. test/run_tests.py +34 -0
  35. test/test.py +1067 -0
  36. test/test_gui_only.py +189 -0
  37. tools/__init__.py +0 -0
  38. tools/auth.py +85 -0
  39. tools/aws_functions.py +387 -0
  40. tools/combine_sheets_into_xlsx.py +615 -0
  41. tools/config.py +950 -0
  42. tools/custom_csvlogger.py +333 -0
  43. tools/dedup_summaries.py +0 -0
  44. tools/example_table_outputs.py +94 -0
  45. tools/helper_functions.py +1245 -0
  46. tools/llm_api_call.py +0 -0
  47. tools/llm_funcs.py +1999 -0
  48. tools/prompts.py +260 -0
  49. windows_install_llama-cpp-python.txt +111 -0
.dockerignore ADDED
@@ -0,0 +1,27 @@
*.pdf
*.url
*.jpg
*.png
*.ipynb
*.xls
*.xlsx
examples/*
output/*
tools/__pycache__/*
build/*
dist/*
logs/*
usage/*
feedback/*
test_code/*
test/tmp/*
unsloth_compiled_cache/*
.vscode/*
llm_topic_modelling.egg-info/*
input/
output/
logs/
usage/
feedback/
config/
tmp/
.gitattributes ADDED
@@ -0,0 +1 @@
*.xlsx filter=lfs diff=lfs merge=lfs -text
.github/workflows/ci.yml ADDED
@@ -0,0 +1,196 @@
name: CI/CD Pipeline

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  #schedule:
    # Run tests daily at 2 AM UTC
    # - cron: '0 2 * * *'

permissions:
  contents: read
  actions: read
  pull-requests: write
  issues: write

env:
  PYTHON_VERSION: "3.11"

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ruff black

      - name: Run Ruff linter
        run: ruff check .

      - name: Run Black formatter check
        run: black --check .

  test-unit:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.11, 3.12, 3.13]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt', '**/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements_lightweight.txt
          pip install pytest pytest-cov pytest-html pytest-xdist

      - name: Verify example data files
        run: |
          echo "Checking if example data directory exists:"
          ls -la example_data/ || echo "example_data directory not found"
          echo "Checking for specific CSV files:"
          ls -la example_data/*.csv || echo "No CSV files found"

      - name: Run CLI and GUI tests
        run: |
          cd test
          python run_tests.py

      - name: Run tests with pytest
        run: |
          pytest test/test.py test/test_gui_only.py -v --tb=short --junitxml=test-results.xml

      - name: Run tests with coverage
        run: |
          pytest test/test.py test/test_gui_only.py --cov=. --cov-report=xml --cov-report=html --cov-report=term

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: test-results-python-${{ matrix.python-version }}
          path: |
            test-results.xml
            htmlcov/
            coverage.xml

  test-integration:
    runs-on: ubuntu-latest
    needs: [lint, test-unit]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements_lightweight.txt
          pip install pytest pytest-cov

      - name: Verify example data files
        run: |
          echo "Checking if example data directory exists:"
          ls -la example_data/
          echo "Checking for specific CSV files:"
          ls -la example_data/*.csv || echo "No CSV files found"

      - name: Run integration tests (CLI and GUI)
        run: |
          cd test
          python run_tests.py

      - name: Test CLI help
        run: |
          python cli_topics.py --help

      - name: Test CLI version
        run: |
          python -c "import sys; print(f'Python {sys.version}')"

  security:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install bandit

      - name: Run bandit security check
        run: |
          bandit -r . -f json -o bandit-report.json || true

      - name: Upload security report
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: security-report
          path: bandit-report.json

  build:
    runs-on: ubuntu-latest
    needs: [lint, test-unit]
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: |
          python -m build

      - name: Check package
        run: |
          twine check dist/*

      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
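The same checks can be reproduced locally before pushing; a minimal sketch using the commands the jobs above run (assumes a Python 3.11 environment):

```bash
pip install -r requirements_lightweight.txt ruff black pytest pytest-cov
ruff check . && black --check .                   # lint job
(cd test && python run_tests.py)                  # CLI and GUI tests
pytest test/test.py test/test_gui_only.py -v --tb=short
```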
.github/workflows/simple-test.yml ADDED
@@ -0,0 +1,46 @@
name: Simple Test Run

on:
  push:
    branches: [ dev ]
  pull_request:
    branches: [ dev ]

permissions:
  contents: read
  actions: read

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements_lightweight.txt
          pip install pytest pytest-cov

      - name: Verify example data files
        run: |
          echo "Checking if example data directory exists:"
          ls -la example_data/ || echo "example_data directory not found"
          echo "Checking for specific CSV files:"
          ls -la example_data/*.csv || echo "No CSV files found"

      - name: Run CLI and GUI tests
        run: |
          cd test
          python run_tests.py

      - name: Run tests with pytest
        run: |
          pytest test/test.py test/test_gui_only.py -v --tb=short
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,53 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [dev]

permissions:
  contents: read

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1 # Only get the latest state
          lfs: true # Download actual LFS files so they can be pushed

      - name: Install Git LFS
        run: git lfs install

      - name: Recreate repo history (single-commit force push)
        run: |
          # 1. Capture the message BEFORE we delete the .git folder
          COMMIT_MSG=$(git log -1 --pretty=%B)
          echo "Syncing commit message: $COMMIT_MSG"

          # 2. DELETE the .git folder.
          #    This turns the repo into a standard folder of files.
          rm -rf .git

          # 3. Re-initialize a brand new git repo
          git init -b main
          git config --global user.name "$HF_USERNAME"
          git config --global user.email "$HF_EMAIL"

          # 4. Re-install LFS (needs to be done after git init)
          git lfs install

          # 5. Add the remote
          git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID

          # 6. Add all files
          #    Since this is a fresh init, Git sees EVERY file as "New"
          git add .

          # 7. Commit and Force Push
          git commit -m "Sync: $COMMIT_MSG"
          git push --force hf main
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_USERNAME: ${{ secrets.HF_USERNAME }}
          HF_EMAIL: ${{ secrets.HF_EMAIL }}
          HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
.gitignore ADDED
@@ -0,0 +1,22 @@
*.pdf
*.url
*.jpg
*.png
*.ipynb
*.xls
*.pyc
examples/*
output/*
tools/__pycache__/*
build/*
dist/*
logs/*
usage/*
feedback/*
test_code/*
config/*
tmp/*
test/tmp/*
unsloth_compiled_cache/*
.vscode/*
llm_topic_modelling.egg-info/*
Dockerfile ADDED
@@ -0,0 +1,166 @@
# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
# Stage 1: Build dependencies and download models
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS builder

# Install system dependencies.
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    cmake \
    #libopenblas-dev \
    pkg-config \
    python3-dev \
    libffi-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

COPY requirements_lightweight.txt .

# Set environment variables for OpenBLAS - not necessary if not building from source
# ENV OPENBLAS_VERBOSE=1
# ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"

ARG INSTALL_TORCH=False
ENV INSTALL_TORCH=${INSTALL_TORCH}

RUN if [ "$INSTALL_TORCH" = "True" ]; then \
    pip install --no-cache-dir --target=/install torch==2.9.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu; \
    fi

ARG INSTALL_LLAMA_CPP_PYTHON=False
ENV INSTALL_LLAMA_CPP_PYTHON=${INSTALL_LLAMA_CPP_PYTHON}

RUN if [ "$INSTALL_LLAMA_CPP_PYTHON" = "True" ]; then \
    pip install --no-cache-dir --target=/install https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl; \
    fi

RUN pip install --no-cache-dir --target=/install -r requirements_lightweight.txt

RUN rm requirements_lightweight.txt

# ===================================================================
# Stage 2: A common 'base' for both Lambda and Gradio
# ===================================================================
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS base

# Set build-time and runtime environment variable for whether to run in Gradio mode or Lambda mode
ARG APP_MODE=gradio
ENV APP_MODE=${APP_MODE}

# Install runtime system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    libopenblas0 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

ENV APP_HOME=/home/user

# Set env variables for Gradio & other apps
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
    MPLCONFIGDIR=/tmp/matplotlib_cache/ \
    GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
    GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
    FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
    CONFIG_FOLDER=$APP_HOME/app/config/ \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
    PATH=$APP_HOME/.local/bin:$PATH \
    PYTHONPATH=$APP_HOME/app \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

# Copy Python packages from the builder stage
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
COPY --from=builder /install/bin /usr/local/bin/

# Copy your application code and entrypoint
COPY . ${APP_HOME}/app
COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
# Fix line endings and set execute permissions
RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
    && chmod +x ${APP_HOME}/app/entrypoint.sh

WORKDIR ${APP_HOME}/app

# ===================================================================
# FINAL Stage 3: The Lambda Image (runs as root for simplicity)
# ===================================================================
FROM base AS lambda
# Set runtime ENV for Lambda mode
ENV APP_MODE=lambda
ENTRYPOINT ["/home/user/app/entrypoint.sh"]
CMD ["lambda_entrypoint.lambda_handler"]

# ===================================================================
# FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
# ===================================================================
FROM base AS gradio
# Set runtime ENV for Gradio mode
ENV APP_MODE=gradio

# Create non-root user
RUN useradd -m -u 1000 user

# Create the base application directory and set its ownership
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app

# Create required sub-folders within the app directory and set their permissions
RUN mkdir -p \
    ${APP_HOME}/app/output \
    ${APP_HOME}/app/input \
    ${APP_HOME}/app/logs \
    ${APP_HOME}/app/usage \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config \
    && chown user:user \
    ${APP_HOME}/app/output \
    ${APP_HOME}/app/input \
    ${APP_HOME}/app/logs \
    ${APP_HOME}/app/usage \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config \
    && chmod 755 \
    ${APP_HOME}/app/output \
    ${APP_HOME}/app/input \
    ${APP_HOME}/app/logs \
    ${APP_HOME}/app/usage \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config

# Now handle the /tmp directories
RUN mkdir -p /tmp/gradio_tmp /tmp/matplotlib_cache /tmp /var/tmp \
    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/matplotlib_cache \
    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/matplotlib_cache

# Apply user ownership to all files in the home directory
RUN chown -R user:user /home/user

# Set permissions for Python executable
RUN chmod 755 /usr/local/bin/python

# Declare volumes
VOLUME ["/tmp/matplotlib_cache"]
VOLUME ["/tmp/gradio_tmp"]
VOLUME ["/home/user/app/output"]
VOLUME ["/home/user/app/input"]
VOLUME ["/home/user/app/logs"]
VOLUME ["/home/user/app/usage"]
VOLUME ["/home/user/app/feedback"]
VOLUME ["/home/user/app/config"]
VOLUME ["/tmp"]
VOLUME ["/var/tmp"]

USER user

EXPOSE $GRADIO_SERVER_PORT

ENTRYPOINT ["/home/user/app/entrypoint.sh"]
CMD ["python", "app.py"]
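A minimal build-and-run sketch for the Gradio target (the image tag is illustrative; the stage names, build args, and port come from the Dockerfile above):

```bash
# Build the Gradio image; set the build args to "True" to bake in torch or llama-cpp-python
docker build --target gradio --build-arg INSTALL_TORCH=False --build-arg INSTALL_LLAMA_CPP_PYTHON=False -t llm-topic-modelling .
# Run it, publishing the default Gradio port
docker run -p 7860:7860 llm-topic-modelling
```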
README.md ADDED
@@ -0,0 +1,176 @@
---
title: Large language model topic modelling
emoji: 📚
colorFrom: purple
colorTo: yellow
sdk: gradio
sdk_version: 6.0.2
app_file: app.py
pinned: true
license: agpl-3.0
short_description: Create thematic summaries for open text data with LLMs
---

# Large language model topic modelling

Version: 0.6.0

Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if local (see tools/config.py to modify), Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app queries the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets on the main app page, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).

NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.

Basic use:
1. On the front page, choose your model for inference. Gemma 3/GPT-OSS will use 'on-device' inference. Calls to Gemini or AWS will require an API key that can be input on the 'LLM and topic extraction' page.
2. Upload a csv/xlsx/parquet file containing at least one open text column.
3. Select the relevant open text column from the dropdown.
4. If you have your own suggested (zero-shot) topics, upload them (see the examples folder for an example file).
5. Write a one-sentence description of the consultation/context of the open text.
6. Click 'Extract topics, deduplicate, and summarise'. This will run through the whole analysis process from topic extraction, to topic deduplication, to topic-level and overall summaries.
7. A summary xlsx workbook will be created on the front page in the box 'Overall summary xlsx file'. This combines all the results from the different processes into one workbook.

# Installation guide

Here is a step-by-step guide to clone the repository, create a virtual environment, and install dependencies from the relevant `requirements` file. This guide assumes you have **Git** and **Python 3.11** installed.

-----

### Step 1: Clone the Git Repository

First, you need to copy the project files to your local machine. Navigate to the directory where you want to store the project using the `cd` (change directory) command. Then, use `git clone` with the repository's URL.

1. **Clone the repo:**

```bash
git clone https://github.com/seanpedrick-case/llm_topic_modelling.git
```

2. **Navigate into the new project folder:**

```bash
cd llm_topic_modelling
```
-----

### Step 2: Create and Activate a Virtual Environment

A virtual environment is a self-contained directory that holds a specific Python interpreter and its own set of installed packages. This is crucial for isolating your project's dependencies.

NOTE: Alternatively, you could create and activate a Conda environment instead of using venv below.

1. **Create the virtual environment:** We'll use Python's built-in `venv` module. It's common practice to name the environment folder `.venv`.

```bash
python -m venv .venv
```

*This command tells Python to create a new virtual environment in a folder named `.venv`.*

2. **Activate the environment:** You must "activate" the environment to start using it. The command differs based on your operating system and shell.

* **On macOS / Linux (bash/zsh):**

```bash
source .venv/bin/activate
```

* **On Windows (Command Prompt):**

```bash
.\.venv\Scripts\activate
```

* **On Windows (PowerShell):**

```powershell
.\.venv\Scripts\Activate.ps1
```

You'll know it's active because your command prompt will be prefixed with `(.venv)`.

-----

### Step 3: Install Dependencies

Now that your virtual environment is active, you can install all the required packages. You have two options: install from the pyproject.toml file (recommended), or install from requirements files.

1. **Install from pyproject.toml (recommended)**

You can install the 'lightweight' version of the app to access all available cloud provider or local inference (e.g. llama server, vLLM server) APIs. This version will not allow you to run local models such as Gemma 12b or GPT-OSS-20b 'in-app', i.e. accessible from the GUI interface directly. However, you will have access to AWS, Gemini, or Azure/OpenAI models with appropriate API keys. Use the following command in your environment to install the relevant packages:

```bash
pip install .
```

#### Install torch (optional)

If you want to run inference with transformers with full/quantised models, and the associated Unsloth package, you can run the following command for CPU inference. For GPU inference, please refer to the requirements_gpu.txt guide and the 'Install from a requirements file' section below:

```bash
pip install .[torch]
```

#### Install llama-cpp-python (optional)

You can run quantised GGUF models in-app using llama-cpp-python. However, installation of this package is not always straightforward, particularly as wheels for the latest version are only available for Linux. The package is not being updated regularly, so support for it may be removed in future. Long term, I would advise looking into running GGUF models using llama-server and calling the API from this app using the lightweight version (details here: https://github.com/ggml-org/llama.cpp).

If you do want to install llama-cpp-python in-app, first try the following command:

```bash
pip install .[llamacpp]
```

This will install the CPU version of llama-cpp-python. If you want GPU support, first try pip install with specific wheels for your system, e.g. for Linux see the files in https://github.com/abetlen/llama-cpp-python/releases/tag/v0.3.16-cu124. If you are still struggling, see here for more details on installation: https://llama-cpp-python.readthedocs.io/en/latest

**NOTE:** A sister repository contains [llama-cpp-python 3.16 wheels for Python version 3.11/10](https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/tag/v0.1.0) so that users can avoid having to build the package from source. I also have a guide to building the package on a Windows system [here](https://github.com/seanpedrick-case/llm_topic_modelling/blob/main/windows_install_llama-cpp-python.txt).
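For example, installing the prebuilt Linux CPU wheel for Python 3.11 from that sister repository directly (the same wheel the Dockerfile uses; substitute the wheel matching your system):

```bash
pip install https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
```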
#### Install mcp version of gradio

You can install an mcp-compatible version of gradio for this app with the following command:

```bash
pip install .[mcp]
```

2. **Install from a requirements file (not recommended)**

The repo provides several requirements files that are relevant for different situations. To start, I advise installing using the **requirements_lightweight.txt** file, which installs the app with access to all cloud provider or local inference (e.g. llama server, vLLM server) APIs. This approach is much simpler as a first step, and avoids the potentially complicated llama-cpp-python installation and GPU management described below.

If you want to run models locally 'in app', then you have two further requirements files to choose from:

- **requirements_cpu.txt**: Used for Python 3.11 CPU-only environments. Uncomment the requirements under 'Windows' for Windows compatibility. Make sure you have [Openblas](https://github.com/OpenMathLib/OpenBLAS) installed!
- **requirements_gpu.txt**: Used for Python 3.11 GPU-enabled environments. Uncomment the requirements under 'Windows' for Windows compatibility (CUDA 12.4).

Example: the instructions below show how to install the GPU-enabled version of the app for local inference.

**Install packages for local model 'in-app' inference from the requirements file:**
```bash
pip install -r requirements_gpu.txt
```
*This command reads every package name listed in the file and installs it into your `.venv` environment.*

NOTE: If the default llama-cpp-python installation does not work when installing from the above, go into the requirements_gpu.txt file and uncomment the lines to install a wheel for llama-cpp-python 0.3.16 relevant to your system.

### Step 4: Verify CUDA compatibility (if using a GPU environment)

Install the relevant toolkit for CUDA 12.4 from here: https://developer.nvidia.com/cuda-12-4-0-download-archive

Restart your computer.

Ensure you have the latest drivers for your NVIDIA GPU. Check your current version and memory availability by running `nvidia-smi`.

In the command line, CUDA compatibility can be checked by running `nvcc --version`, as shown below.
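Both checks, for convenience (standard NVIDIA tools, assumed to be on your PATH):

```bash
nvidia-smi      # driver version and GPU memory availability
nvcc --version  # installed CUDA toolkit version
```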
### Step 5: Ensure you have compatible NVIDIA drivers

Make sure you have the latest NVIDIA drivers installed on your system for your GPU (be careful in particular if using WSL that you have drivers compatible with this). Official drivers can be found here: https://www.nvidia.com/en-us/drivers

Current drivers can be found by running `nvidia-smi` in the command line.

### Step 6: Run the app

Go to the app project directory and run `python app.py`.

### Step 7: (optional) change default configuration

A number of configuration options can be seen in the tools/config.py file. You can either pass these variables in as environment variables, or you can create a file at config/app_config.env to read into the app on initialisation, as in the sketch below.
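For instance, a minimal `config/app_config.env` enabling the S3 output and DynamoDB logging options added in this commit might look like the following (variable names follow tools/config.py; all values are illustrative):

```bash
SAVE_OUTPUTS_TO_S3=True
S3_OUTPUTS_BUCKET=my-example-bucket
S3_OUTPUTS_FOLDER=topic-model-outputs/
SAVE_LOGS_TO_DYNAMODB=True
USAGE_LOG_DYNAMODB_TABLE_NAME=llm-topic-usage-logs
GEMINI_API_KEY=your-key-here
```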
app.py ADDED
The diff for this file is too large to render. See raw diff
 
cli_topics.py ADDED
@@ -0,0 +1,1943 @@
import argparse
import csv
import os
import time
import uuid
from datetime import datetime

import boto3
import botocore
import pandas as pd

from tools.aws_functions import download_file_from_s3, export_outputs_to_s3
from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
from tools.config import (
    API_URL,
    AWS_ACCESS_KEY,
    AWS_REGION,
    AWS_SECRET_KEY,
    AZURE_OPENAI_API_KEY,
    AZURE_OPENAI_INFERENCE_ENDPOINT,
    BATCH_SIZE_DEFAULT,
    CHOSEN_INFERENCE_SERVER_MODEL,
    CSV_USAGE_LOG_HEADERS,
    DEDUPLICATION_THRESHOLD,
    DEFAULT_COST_CODE,
    DEFAULT_SAMPLED_SUMMARIES,
    DYNAMODB_USAGE_LOG_HEADERS,
    GEMINI_API_KEY,
    GRADIO_TEMP_DIR,
    HF_TOKEN,
    INPUT_FOLDER,
    LLM_MAX_NEW_TOKENS,
    LLM_SEED,
    LLM_TEMPERATURE,
    MAX_TIME_FOR_LOOP,
    OUTPUT_DEBUG_FILES,
    OUTPUT_FOLDER,
    RUN_AWS_FUNCTIONS,
    S3_OUTPUTS_BUCKET,
    S3_OUTPUTS_FOLDER,
    SAVE_LOGS_TO_CSV,
    SAVE_LOGS_TO_DYNAMODB,
    SAVE_OUTPUTS_TO_S3,
    SESSION_OUTPUT_FOLDER,
    USAGE_LOG_DYNAMODB_TABLE_NAME,
    USAGE_LOG_FILE_NAME,
    USAGE_LOGS_FOLDER,
    convert_string_to_boolean,
    default_model_choice,
    default_model_source,
    model_name_map,
)
from tools.dedup_summaries import (
    deduplicate_topics,
    deduplicate_topics_llm,
    overall_summary,
    wrapper_summarise_output_topics_per_group,
)
from tools.helper_functions import (
    load_in_data_file,
    load_in_previous_data_files,
)
from tools.llm_api_call import (
    all_in_one_pipeline,
    validate_topics_wrapper,
    wrapper_extract_topics_per_column_value,
)
from tools.prompts import (
    add_existing_topics_prompt,
    add_existing_topics_system_prompt,
    initial_table_prompt,
    initial_table_system_prompt,
    single_para_summary_format_prompt,
    two_para_summary_format_prompt,
)


def _generate_session_hash() -> str:
    """Generate a unique session hash for logging purposes."""
    return str(uuid.uuid4())[:8]


def _download_s3_file_if_needed(
    file_path: str,
    default_filename: str = "downloaded_file",
    aws_access_key: str = "",
    aws_secret_key: str = "",
    aws_region: str = "",
) -> str:
    """
    Download a file from S3 if the path starts with 's3://' or 'S3://', otherwise return the path as-is.

    Args:
        file_path: File path (either local or S3 URL)
        default_filename: Default filename to use if S3 key doesn't have a filename
        aws_access_key: AWS access key ID (optional, uses environment/config if not provided)
        aws_secret_key: AWS secret access key (optional, uses environment/config if not provided)
        aws_region: AWS region (optional, uses environment/config if not provided)

    Returns:
        Local file path (downloaded from S3 or original path)
    """
    if not file_path:
        return file_path

    # Check for S3 URL (case-insensitive)
    file_path_stripped = file_path.strip()
    file_path_upper = file_path_stripped.upper()
    if not file_path_upper.startswith("S3://"):
        return file_path

    # Ensure temp directory exists
    os.makedirs(GRADIO_TEMP_DIR, exist_ok=True)

    # Parse S3 URL: s3://bucket/key (preserve original case for bucket/key)
    # Remove 's3://' prefix (case-insensitive)
    s3_path = (
        file_path_stripped.split("://", 1)[1]
        if "://" in file_path_stripped
        else file_path_stripped
    )
    # Split bucket and key (first '/' separates bucket from key)
    if "/" in s3_path:
        bucket_name_s3, s3_key = s3_path.split("/", 1)
    else:
        # If no key provided, use bucket name as key (unlikely but handle it)
        bucket_name_s3 = s3_path
        s3_key = ""

    # Get the filename from the S3 key
    filename = os.path.basename(s3_key) if s3_key else bucket_name_s3
    if not filename:
        filename = default_filename

    # Create local file path in temp directory
    local_file_path = os.path.join(GRADIO_TEMP_DIR, filename)

    # Download file from S3
    try:
        download_file_from_s3(
            bucket_name=bucket_name_s3,
            key=s3_key,
            local_file_path=local_file_path,
            aws_access_key_textbox=aws_access_key,
            aws_secret_key_textbox=aws_secret_key,
            aws_region_textbox=aws_region,
        )
        print(f"S3 file downloaded successfully: {file_path} -> {local_file_path}")
        return local_file_path
    except Exception as e:
        print(f"Error downloading file from S3 ({file_path}): {e}")
        raise Exception(f"Failed to download file from S3: {e}")
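Since this helper resolves `s3://` paths transparently, file arguments can plausibly be given as S3 URLs; a hypothetical invocation (bucket and key are illustrative, and assume read access to the bucket):

```bash
python cli_topics.py --task extract --input_file "s3://my-example-bucket/inputs/combined_case_notes.csv" --text_column "Case Note"
```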
def get_username_and_folders(
    username: str = "",
    output_folder_textbox: str = OUTPUT_FOLDER,
    input_folder_textbox: str = INPUT_FOLDER,
    session_output_folder: bool = SESSION_OUTPUT_FOLDER,
):
    """Generate session hash and set up output/input folders."""
    # Generate session hash for logging. Either from input user name or generated
    if username:
        out_session_hash = username
    else:
        out_session_hash = _generate_session_hash()

    if session_output_folder:
        output_folder = output_folder_textbox + out_session_hash + "/"
        input_folder = input_folder_textbox + out_session_hash + "/"
    else:
        output_folder = output_folder_textbox
        input_folder = input_folder_textbox

    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
    if not os.path.exists(input_folder):
        os.makedirs(input_folder, exist_ok=True)

    return (
        out_session_hash,
        output_folder,
        out_session_hash,
        input_folder,
    )


def upload_outputs_to_s3_if_enabled(
    output_files: list,
    base_file_name: str = None,
    session_hash: str = "",
    s3_output_folder: str = S3_OUTPUTS_FOLDER,
    s3_bucket: str = S3_OUTPUTS_BUCKET,
    save_outputs_to_s3: bool = None,
):
    """
    Upload output files to S3 if SAVE_OUTPUTS_TO_S3 is enabled.

    Args:
        output_files: List of output file paths to upload
        base_file_name: Base file name (input file) for organizing S3 folder structure
        session_hash: Session hash to include in S3 path
        s3_output_folder: S3 output folder path
        s3_bucket: S3 bucket name
        save_outputs_to_s3: Override for SAVE_OUTPUTS_TO_S3 config (if None, uses config value)
    """
    # Use provided value or fall back to config
    if save_outputs_to_s3 is None:
        save_outputs_to_s3 = convert_string_to_boolean(SAVE_OUTPUTS_TO_S3)

    if not save_outputs_to_s3:
        return

    if not s3_bucket:
        print("Warning: S3_OUTPUTS_BUCKET not configured. Skipping S3 upload.")
        return

    if not output_files:
        print("No output files to upload to S3.")
        return

    # Filter out empty/None values and ensure files exist
    valid_files = []
    for file_path in output_files:
        if file_path and os.path.exists(file_path):
            valid_files.append(file_path)
        elif file_path:
            print(f"Warning: Output file does not exist, skipping: {file_path}")

    if not valid_files:
        print("No valid output files to upload to S3.")
        return

    # Construct S3 output folder path
    # Include session hash if provided and SESSION_OUTPUT_FOLDER is enabled
    s3_folder_path = s3_output_folder or ""
    if session_hash and convert_string_to_boolean(SESSION_OUTPUT_FOLDER):
        if s3_folder_path and not s3_folder_path.endswith("/"):
            s3_folder_path += "/"
        s3_folder_path += session_hash + "/"

    print(f"\nUploading {len(valid_files)} output file(s) to S3...")
    try:
        export_outputs_to_s3(
            file_list_state=valid_files,
            s3_output_folder_state_value=s3_folder_path,
            save_outputs_to_s3_flag=True,
            base_file_state=base_file_name,
            s3_bucket=s3_bucket,
        )
    except Exception as e:
        print(f"Warning: Failed to upload outputs to S3: {e}")
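Upload is gated on the SAVE_OUTPUTS_TO_S3 config value, so one way to enable it for a single run is via environment variables read by tools/config.py (a sketch; the bucket name is illustrative):

```bash
SAVE_OUTPUTS_TO_S3=True S3_OUTPUTS_BUCKET=my-example-bucket \
  python cli_topics.py --task extract --input_file example_data/combined_case_notes.csv --text_column "Case Note"
```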
def write_usage_log(
    session_hash: str,
    file_name: str,
    text_column: str,
    model_choice: str,
    conversation_metadata: str,
    input_tokens: int,
    output_tokens: int,
    number_of_calls: int,
    estimated_time_taken: float,
    cost_code: str = DEFAULT_COST_CODE,
    save_to_csv: bool = SAVE_LOGS_TO_CSV,
    save_to_dynamodb: bool = SAVE_LOGS_TO_DYNAMODB,
    include_conversation_metadata: bool = False,
):
    """
    Write usage log entry to CSV file and/or DynamoDB.

    Args:
        session_hash: Session identifier
        file_name: Name of the input file
        text_column: Column name used for analysis (as list for CSV)
        model_choice: LLM model used
        conversation_metadata: Metadata string
        input_tokens: Number of input tokens
        output_tokens: Number of output tokens
        number_of_calls: Number of LLM calls
        estimated_time_taken: Time taken in seconds
        cost_code: Cost code for tracking
        save_to_csv: Whether to save to CSV
        save_to_dynamodb: Whether to save to DynamoDB
        include_conversation_metadata: Whether to include conversation metadata in the log
    """
    # Convert boolean parameters if they're strings
    if isinstance(save_to_csv, str):
        save_to_csv = convert_string_to_boolean(save_to_csv)
    if isinstance(save_to_dynamodb, str):
        save_to_dynamodb = convert_string_to_boolean(save_to_dynamodb)

    # Return early if neither logging method is enabled
    if not save_to_csv and not save_to_dynamodb:
        return

    if not conversation_metadata:
        conversation_metadata = ""

    # Ensure usage logs folder exists
    os.makedirs(USAGE_LOGS_FOLDER, exist_ok=True)

    # Construct full file path
    usage_log_file_path = os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)

    # Prepare data row - order matches app.py component order
    # session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice,
    # conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num,
    # number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop
    data = [
        session_hash,
        file_name,
        (
            text_column
            if isinstance(text_column, str)
            else (text_column[0] if text_column else "")
        ),
        model_choice,
        conversation_metadata if conversation_metadata else "",
        input_tokens,
        output_tokens,
        number_of_calls,
        estimated_time_taken,
        cost_code,
    ]

    # Add id and timestamp
    generated_id = str(uuid.uuid4())
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
    data.extend([generated_id, timestamp])

    # Use custom headers if available, otherwise use default
    # Note: CSVLogger_custom uses component labels, but we need to match what collect_output_csvs_and_create_excel_output expects
    if CSV_USAGE_LOG_HEADERS and len(CSV_USAGE_LOG_HEADERS) == len(data):
        headers = CSV_USAGE_LOG_HEADERS
    else:
        # Default headers - these should match what CSVLogger_custom creates from Gradio component labels
        # The components are: session_hash_textbox, original_data_file_name_textbox, in_colnames,
        # model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num,
        # number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop
        # Since these are hidden components without labels, CSVLogger_custom uses component variable names
        # or default labels. We need to match what collect_output_csvs_and_create_excel_output expects:
        # "Total LLM calls", "Total input tokens", "Total output tokens"
        # But the actual CSV from Gradio likely has: "Number of calls", "Input tokens", "Output tokens"
        # Let's use the names that match what the Excel function expects
        headers = [
            "Session hash",
            "Reference data file name",
            "Select the open text column of interest. In an Excel file, this shows columns across all sheets.",
            "Large language model for topic extraction and summarisation",
            "Conversation metadata",
            "Total input tokens",  # Changed from "Input tokens" to match Excel function
            "Total output tokens",  # Changed from "Output tokens" to match Excel function
            "Total LLM calls",  # Changed from "Number of calls" to match Excel function
            "Estimated time taken (seconds)",
            "Cost code",
            "id",
            "timestamp",
        ]

    # Write to CSV if enabled
    if save_to_csv:
        # Ensure usage logs folder exists
        os.makedirs(USAGE_LOGS_FOLDER, exist_ok=True)

        # Construct full file path
        usage_log_file_path = os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)

        # Write to CSV
        file_exists = os.path.exists(usage_log_file_path)
        with open(
            usage_log_file_path, "a", newline="", encoding="utf-8-sig"
        ) as csvfile:
            writer = csv.writer(csvfile)
            if not file_exists:
                # Write headers if file doesn't exist
                writer.writerow(headers)
            writer.writerow(data)

    # Write to DynamoDB if enabled
    if save_to_dynamodb:
        # DynamoDB logging implementation
        print("Saving to DynamoDB")

        try:
            # Connect to DynamoDB
            if RUN_AWS_FUNCTIONS == "1":
                try:
                    print("Connecting to DynamoDB via existing SSO connection")
                    dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
                    dynamodb.meta.client.list_tables()
                except Exception as e:
                    print("No SSO credentials found:", e)
                    if AWS_ACCESS_KEY and AWS_SECRET_KEY:
                        print("Trying DynamoDB credentials from environment variables")
                        dynamodb = boto3.resource(
                            "dynamodb",
                            aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET_KEY,
                            region_name=AWS_REGION,
                        )
                    else:
                        raise Exception(
                            "AWS credentials for DynamoDB logging not found"
                        )
            else:
                raise Exception("AWS credentials for DynamoDB logging not found")

            # Get table name from config
            dynamodb_table_name = USAGE_LOG_DYNAMODB_TABLE_NAME
            if not dynamodb_table_name:
                raise ValueError(
                    "USAGE_LOG_DYNAMODB_TABLE_NAME not configured. Cannot save to DynamoDB."
                )

            # Determine headers for DynamoDB
            # Use DYNAMODB_USAGE_LOG_HEADERS if available and matches data length,
            # otherwise use CSV_USAGE_LOG_HEADERS if it matches, otherwise use default headers
            # Note: headers and data are guaranteed to have the same length and include id/timestamp
            if DYNAMODB_USAGE_LOG_HEADERS and len(DYNAMODB_USAGE_LOG_HEADERS) == len(
                data
            ):
                dynamodb_headers = list(DYNAMODB_USAGE_LOG_HEADERS)  # Make a copy
            elif CSV_USAGE_LOG_HEADERS and len(CSV_USAGE_LOG_HEADERS) == len(data):
                dynamodb_headers = list(CSV_USAGE_LOG_HEADERS)  # Make a copy
            else:
                # Use the headers we created which are guaranteed to match data
                dynamodb_headers = headers

            # Check if table exists, create if it doesn't
            try:
                table = dynamodb.Table(dynamodb_table_name)
                table.load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "ResourceNotFoundException":
                    print(
                        f"Table '{dynamodb_table_name}' does not exist. Creating it..."
                    )
                    attribute_definitions = [
                        {
                            "AttributeName": "id",
                            "AttributeType": "S",
                        }
                    ]

                    table = dynamodb.create_table(
                        TableName=dynamodb_table_name,
                        KeySchema=[{"AttributeName": "id", "KeyType": "HASH"}],
                        AttributeDefinitions=attribute_definitions,
                        BillingMode="PAY_PER_REQUEST",
                    )
                    # Wait until the table exists
                    table.meta.client.get_waiter("table_exists").wait(
                        TableName=dynamodb_table_name
                    )
                    time.sleep(5)
                    print(f"Table '{dynamodb_table_name}' created successfully.")
                else:
                    raise

            # Prepare the DynamoDB item to upload
            # Map the headers to values (headers and data should match in length)
            if len(dynamodb_headers) == len(data):
                item = {
                    header: str(value) for header, value in zip(dynamodb_headers, data)
                }
            else:
                # Fallback: use the default headers which are guaranteed to match data
                print(
                    f"Warning: DynamoDB headers length ({len(dynamodb_headers)}) doesn't match data length ({len(data)}). Using default headers."
                )
                item = {header: str(value) for header, value in zip(headers, data)}

            # Upload to DynamoDB
            table.put_item(Item=item)
            print("Successfully uploaded log to DynamoDB")

        except Exception as e:
            print(f"Could not upload log to DynamoDB due to: {e}")
            import traceback

            traceback.print_exc()
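Logged items land in the table named by USAGE_LOG_DYNAMODB_TABLE_NAME, keyed on `id` with all values stringified. A quick way to inspect them (the table name is illustrative; the commit also adds load_dynamo_logs.py for this purpose):

```bash
aws dynamodb scan --table-name llm-topic-usage-logs --max-items 5
```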
486
+ # --- Main CLI Function ---
487
+ def main(direct_mode_args={}):
488
+ """
489
+ A unified command-line interface for topic extraction, validation, deduplication, and summarisation.
490
+
491
+ Args:
492
+ direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
493
+ If provided, uses these instead of parsing command line arguments.
494
+ """
495
+ parser = argparse.ArgumentParser(
496
+ description="A versatile CLI for topic extraction, validation, deduplication, and summarisation using LLMs.",
497
+ formatter_class=argparse.RawTextHelpFormatter,
498
+ epilog="""
499
+ Examples:
500
+
501
+ To run these, you need to do the following:
502
+
503
+ - Open a terminal window
504
+
505
+ - CD to the app folder that contains this file (cli_topics.py)
506
+
507
+ - Load the virtual environment using either conda or venv depending on your setup
508
+
509
+ - Run one of the example commands below
510
+
511
+ - The examples below use the free Gemini 2.5 Flash Lite model, that is free with an API key that you can get from here: https://aistudio.google.com/api-keys. You can either set this or API keys for other services as an environment variable (e.g. in config/app_config.py. See the file tools/config.py for more details about variables relevant to each service) or you can set them manually at the time of the function call via command line arguments, such as the following:
512
+
513
+ Google/Gemini: --google_api_key
514
+ AWS Bedrock: --aws_access_key, --aws_secret_key, --aws_region
515
+ Hugging Face (for model download): --hf_token
516
+ Azure/OpenAI: --azure_api_key, --azure_endpoint
517
+ Inference Server endpoint for local models(e.g. llama server, vllm): --api_url
518
+
519
+ - Use --create_xlsx_output to create an Excel file combining all CSV outputs after task completion
520
+
521
+ - Look in the output/ folder to see output files:
522
+
523
+ # Topic Extraction
524
+
525
+ ## Extract topics from a CSV file with default settings:
526
+ python cli_topics.py --task extract --input_file example_data/combined_case_notes.csv --text_column "Case Note"
527
+
528
+ ## Extract topics with custom model and context:
529
+ python cli_topics.py --task extract --input_file example_data/combined_case_notes.csv --text_column "Case Note" --model_choice "gemini-2.5-flash-lite" --context "Social Care case notes for young people"
530
+
531
+ ## Extract topics with grouping:
532
+ python cli_topics.py --task extract --input_file example_data/combined_case_notes.csv --text_column "Case Note" --group_by "Client"
533
+
534
+ ## Extract topics with candidate topics (zero-shot):
535
+ python cli_topics.py --task extract --input_file example_data/dummy_consultation_response.csv --text_column "Response text" --candidate_topics example_data/dummy_consultation_response_themes.csv
536
+
537
+ # Topic Validation
538
+
539
+ ## Validate previously extracted topics:
540
+ python cli_topics.py --task validate --input_file example_data/combined_case_notes.csv --text_column "Case Note" --previous_output_files output/combined_case_notes_col_Case_Note_reference_table.csv output/combined_case_notes_col_Case_Note_unique_topics.csv
541
+
542
+ # Deduplication
543
+
544
+ Note: you will need to change the reference to previous output files to match the exact file names created from the previous task. This includes the relative path to the app folder. Also, the function will create an xlsx output file by default. the --input_file and --text_column arguments are needed for this, unless you pass in --no_xlsx_output as seen below.
545
+
546
+ ## Deduplicate topics using fuzzy matching:
547
+ python cli_topics.py --task deduplicate --previous_output_files output/combined_case_notes_col_Case_Note_reference_table.csv output/combined_case_notes_col_Case_Note_unique_topics.csv --similarity_threshold 90 --no_xlsx_output
548
+
549
+ ## Deduplicate topics using LLM:
550
+ python cli_topics.py --task deduplicate --previous_output_files output/combined_case_notes_col_Case_Note_reference_table.csv output/combined_case_notes_col_Case_Note_unique_topics.csv --method llm --model_choice "gemini-2.5-flash-lite" --no_xlsx_output
551
+
552
+ # Summarisation
553
+
554
+ Note: you will need to change the reference to previous output files to match the exact file names created from the previous task. This includes the relative path to the app folder. Also, the function will create an xlsx output file by default. the --input_file and --text_column arguments are needed for this, unless you pass in --no_xlsx_output as seen below.
555
+
556
+ ## Summarise topics:
557
+ python cli_topics.py --task summarise --previous_output_files output/combined_case_notes_col_Case_Note_reference_table.csv output/combined_case_notes_col_Case_Note_unique_topics.csv --model_choice "gemini-2.5-flash-lite" --no_xlsx_output
558
+
559
+ ## Create overall summary:
560
+ python cli_topics.py --task overall_summary --previous_output_files output/combined_case_notes_col_Case_Note_unique_topics.csv --model_choice "gemini-2.5-flash-lite" --no_xlsx_output
561
+
562
+ # All-in-one pipeline
563
+
564
+ ## Run complete pipeline (extract, deduplicate, summarise):
565
+ python cli_topics.py --task all_in_one --input_file example_data/combined_case_notes.csv --text_column "Case Note" --model_choice "gemini-2.5-flash-lite"
566
+
567
+ """,
568
+ )
569
+
570
+ # --- Task Selection ---
571
+ task_group = parser.add_argument_group("Task Selection")
572
+ task_group.add_argument(
573
+ "--task",
574
+ choices=[
575
+ "extract",
576
+ "validate",
577
+ "deduplicate",
578
+ "summarise",
579
+ "overall_summary",
580
+ "all_in_one",
581
+ ],
582
+ default="extract",
583
+ help="Task to perform: extract (topic extraction), validate (validate topics), deduplicate (deduplicate topics), summarise (summarise topics), overall_summary (create overall summary), or all_in_one (complete pipeline).",
584
+ )
585
+
586
+ # --- General Arguments ---
587
+ general_group = parser.add_argument_group("General Options")
588
+ general_group.add_argument(
589
+ "--input_file",
590
+ nargs="+",
591
+ help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.",
592
+ )
593
+ general_group.add_argument(
594
+ "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files."
595
+ )
596
+ general_group.add_argument(
597
+ "--input_dir", default=INPUT_FOLDER, help="Directory for all input files."
598
+ )
599
+ general_group.add_argument(
600
+ "--text_column",
601
+ help="Name of the text column to process (required for extract, validate, and all_in_one tasks).",
602
+ )
603
+ general_group.add_argument(
604
+ "--previous_output_files",
605
+ nargs="+",
606
+ help="Path(s) to previous output files (reference_table and/or unique_topics files) for validate, deduplicate, summarise, and overall_summary tasks.",
607
+ )
608
+ general_group.add_argument(
609
+ "--username", default="", help="Username for the session."
610
+ )
611
+ general_group.add_argument(
612
+ "--save_to_user_folders",
613
+ default=SESSION_OUTPUT_FOLDER,
614
+ help="Whether to save to user folders or not.",
615
+ )
616
+ general_group.add_argument(
617
+ "--excel_sheets",
618
+ nargs="+",
619
+ default=list(),
620
+ help="Specific Excel sheet names to process.",
621
+ )
622
+ general_group.add_argument(
623
+ "--group_by",
624
+ help="Column name to group results by.",
625
+ )
626
+
627
+ # --- Model Configuration ---
628
+ model_group = parser.add_argument_group("Model Configuration")
629
+ model_group.add_argument(
630
+ "--model_choice",
631
+ default=default_model_choice,
632
+ help=f"LLM model to use. Default: {default_model_choice}",
633
+ )
634
+ model_group.add_argument(
635
+ "--model_source",
636
+ default=default_model_source,
637
+ help=f"Model source (e.g., 'Google', 'AWS', 'Local'). Default: {default_model_source}",
638
+ )
639
+ model_group.add_argument(
640
+ "--temperature",
641
+ type=float,
642
+ default=LLM_TEMPERATURE,
643
+ help=f"Temperature for LLM generation. Default: {LLM_TEMPERATURE}",
644
+ )
645
+ model_group.add_argument(
646
+ "--batch_size",
647
+ type=int,
648
+ default=BATCH_SIZE_DEFAULT,
649
+ help=f"Number of responses to submit in a single LLM query. Default: {BATCH_SIZE_DEFAULT}",
650
+ )
651
+ model_group.add_argument(
652
+ "--max_tokens",
653
+ type=int,
654
+ default=LLM_MAX_NEW_TOKENS,
655
+ help=f"Maximum tokens for LLM generation. Default: {LLM_MAX_NEW_TOKENS}",
656
+ )
657
+ model_group.add_argument(
658
+ "--google_api_key",
659
+ default=GEMINI_API_KEY,
660
+ help="Google API key for Gemini models.",
661
+ )
662
+ model_group.add_argument(
663
+ "--aws_access_key",
664
+ default=AWS_ACCESS_KEY,
665
+ help="AWS Access Key ID for Bedrock models.",
666
+ )
667
+ model_group.add_argument(
668
+ "--aws_secret_key",
669
+ default=AWS_SECRET_KEY,
670
+ help="AWS Secret Access Key for Bedrock models.",
671
+ )
672
+ model_group.add_argument(
673
+ "--aws_region",
674
+ default=AWS_REGION,
675
+ help="AWS region for Bedrock models.",
676
+ )
677
+ model_group.add_argument(
678
+ "--hf_token",
679
+ default=HF_TOKEN,
680
+ help="Hugging Face token for downloading gated models.",
681
+ )
682
+ model_group.add_argument(
683
+ "--azure_api_key",
684
+ default=AZURE_OPENAI_API_KEY,
685
+ help="Azure/OpenAI API key for Azure/OpenAI models.",
686
+ )
687
+ model_group.add_argument(
688
+ "--azure_endpoint",
689
+ default=AZURE_OPENAI_INFERENCE_ENDPOINT,
690
+ help="Azure Inference endpoint URL.",
691
+ )
692
+ model_group.add_argument(
693
+ "--api_url",
694
+ default=API_URL,
695
+ help=f"Inference server API URL (for local models). Default: {API_URL}",
696
+ )
697
+ model_group.add_argument(
698
+ "--inference_server_model",
699
+ default=CHOSEN_INFERENCE_SERVER_MODEL,
700
+ help=f"Inference server model name to use. Default: {CHOSEN_INFERENCE_SERVER_MODEL}",
701
+ )
702
+
703
+ # --- Topic Extraction Arguments ---
704
+ extract_group = parser.add_argument_group("Topic Extraction Options")
705
+ extract_group.add_argument(
706
+ "--context",
707
+ default="",
708
+ help="Context sentence to provide to the LLM for topic extraction.",
709
+ )
710
+ extract_group.add_argument(
711
+ "--candidate_topics",
712
+ help="Path to CSV file with candidate topics for zero-shot extraction.",
713
+ )
714
+ extract_group.add_argument(
715
+ "--force_zero_shot",
716
+ choices=["Yes", "No"],
717
+ default="No",
718
+ help="Force responses into suggested topics. Default: No",
719
+ )
720
+ extract_group.add_argument(
721
+ "--force_single_topic",
722
+ choices=["Yes", "No"],
723
+ default="No",
724
+ help="Ask the model to assign responses to only a single topic. Default: No",
725
+ )
726
+ extract_group.add_argument(
727
+ "--produce_structured_summary",
728
+ choices=["Yes", "No"],
729
+ default="No",
730
+ help="Produce structured summaries using suggested topics as headers. Default: No",
731
+ )
732
+ extract_group.add_argument(
733
+ "--sentiment",
734
+ choices=[
735
+ "Negative or Positive",
736
+ "Negative, Neutral, or Positive",
737
+ "Do not assess sentiment",
738
+ ],
739
+ default="Negative or Positive",
740
+ help="Response sentiment analysis option. Default: Negative or Positive",
741
+ )
742
+ extract_group.add_argument(
743
+ "--additional_summary_instructions",
744
+ default="",
745
+ help="Additional instructions for summary format.",
746
+ )
747
+
748
+ # --- Validation Arguments ---
749
+ validate_group = parser.add_argument_group("Topic Validation Options")
750
+ validate_group.add_argument(
751
+ "--additional_validation_issues",
752
+ default="",
753
+ help="Additional validation issues for the model to consider (bullet-point list).",
754
+ )
755
+ validate_group.add_argument(
756
+ "--show_previous_table",
757
+ choices=["Yes", "No"],
758
+ default="Yes",
759
+ help="Provide response data to validation process. Default: Yes",
760
+ )
761
+ validate_group.add_argument(
762
+ "--output_debug_files",
763
+ choices=["True", "False"],
764
+ default=OUTPUT_DEBUG_FILES,
765
+ help=f"Output debug files. Default: {OUTPUT_DEBUG_FILES}",
766
+ )
767
+ validate_group.add_argument(
768
+ "--max_time_for_loop",
769
+ type=int,
770
+ default=MAX_TIME_FOR_LOOP,
771
+ help=f"Maximum time for validation loop in seconds. Default: {MAX_TIME_FOR_LOOP}",
772
+ )
773
+
774
+ # --- Deduplication Arguments ---
775
+ dedup_group = parser.add_argument_group("Deduplication Options")
776
+ dedup_group.add_argument(
777
+ "--method",
778
+ choices=["fuzzy", "llm"],
779
+ default="fuzzy",
780
+ help="Deduplication method: fuzzy (fuzzy matching) or llm (LLM semantic matching). Default: fuzzy",
781
+ )
782
+ dedup_group.add_argument(
783
+ "--similarity_threshold",
784
+ type=int,
785
+ default=DEDUPLICATION_THRESHOLD,
786
+ help=f"Similarity threshold (0-100) for fuzzy matching. Default: {DEDUPLICATION_THRESHOLD}",
787
+ )
788
+ dedup_group.add_argument(
789
+ "--merge_sentiment",
790
+ choices=["Yes", "No"],
791
+ default="No",
792
+ help="Merge sentiment values together for duplicate subtopics. Default: No",
793
+ )
794
+ dedup_group.add_argument(
795
+ "--merge_general_topics",
796
+ choices=["Yes", "No"],
797
+ default="Yes",
798
+ help="Merge general topic values together for duplicate subtopics. Default: Yes",
799
+ )
800
+
801
+ # --- Summarisation Arguments ---
802
+ summarise_group = parser.add_argument_group("Summarisation Options")
803
+ summarise_group.add_argument(
804
+ "--summary_format",
805
+ choices=["two_paragraph", "single_paragraph"],
806
+ default="two_paragraph",
807
+ help="Summary format type. Default: two_paragraph",
808
+ )
809
+ summarise_group.add_argument(
810
+ "--sample_reference_table",
811
+ choices=["True", "False"],
812
+ default="True",
813
+ help="Sample reference table (recommended for large datasets). Default: True",
814
+ )
815
+ summarise_group.add_argument(
816
+ "--no_of_sampled_summaries",
817
+ type=int,
818
+ default=DEFAULT_SAMPLED_SUMMARIES,
819
+ help=f"Number of summaries per group. Default: {DEFAULT_SAMPLED_SUMMARIES}",
820
+ )
821
+ summarise_group.add_argument(
822
+ "--random_seed",
823
+ type=int,
824
+ default=LLM_SEED,
825
+ help=f"Random seed for sampling. Default: {LLM_SEED}",
826
+ )
827
+
828
+ # --- Output Format Arguments ---
829
+ output_group = parser.add_argument_group("Output Format Options")
830
+ output_group.add_argument(
831
+ "--no_xlsx_output",
832
+ dest="create_xlsx_output",
833
+ action="store_false",
834
+ default=True,
835
+ help="Disable creation of Excel (.xlsx) output file. By default, Excel output is created.",
836
+ )
837
+
838
+ # --- Logging Arguments ---
839
+ logging_group = parser.add_argument_group("Logging Options")
840
+ logging_group.add_argument(
841
+ "--save_logs_to_csv",
842
+ default=SAVE_LOGS_TO_CSV,
843
+ help="Save processing logs to CSV files.",
844
+ )
845
+ logging_group.add_argument(
846
+ "--save_logs_to_dynamodb",
847
+ default=SAVE_LOGS_TO_DYNAMODB,
848
+ help="Save processing logs to DynamoDB.",
849
+ )
850
+ logging_group.add_argument(
851
+ "--usage_logs_folder",
852
+ default=USAGE_LOGS_FOLDER,
853
+ help="Directory for usage log files.",
854
+ )
855
+ logging_group.add_argument(
856
+ "--cost_code",
857
+ default=DEFAULT_COST_CODE,
858
+ help="Cost code for tracking usage.",
859
+ )
860
+
861
+ # Parse arguments - either from command line or direct mode
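+ # Illustrative direct-mode call (assuming the enclosing entry point is main): the dict keys
+ # mirror the CLI flag names, and every attribute read below must be supplied, because
+ # argparse defaults are not applied to a Namespace built this way, e.g.:
+ # main(direct_mode_args={"task": "extract", "input_file": ["example_data/combined_case_notes.csv"], "text_column": "Case Note", ...})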
862
+ if direct_mode_args:
863
+ # Use direct mode arguments
864
+ args = argparse.Namespace(**direct_mode_args)
865
+ else:
866
+ # Parse command line arguments
867
+ args = parser.parse_args()
868
+
869
+ # --- Handle S3 file downloads ---
870
+ # Get AWS credentials from args or fall back to config values
871
+ aws_access_key = getattr(args, "aws_access_key", None) or AWS_ACCESS_KEY or ""
872
+ aws_secret_key = getattr(args, "aws_secret_key", None) or AWS_SECRET_KEY or ""
873
+ aws_region = getattr(args, "aws_region", None) or AWS_REGION or ""
874
+
875
+ # Download input files from S3 if needed
876
+ # Note: args.input_file is typically a list (from CLI nargs="+" or from direct mode)
877
+ # but we also handle pipe-separated strings for compatibility
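+ # e.g. "s3://my-bucket/notes_jan.csv|s3://my-bucket/notes_feb.csv" (placeholder bucket/keys)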
878
+ if args.input_file:
879
+ if isinstance(args.input_file, list):
880
+ # Handle list of files (may include S3 paths)
881
+ downloaded_files = []
882
+ for file_path in args.input_file:
883
+ downloaded_path = _download_s3_file_if_needed(
884
+ file_path,
885
+ aws_access_key=aws_access_key,
886
+ aws_secret_key=aws_secret_key,
887
+ aws_region=aws_region,
888
+ )
889
+ downloaded_files.append(downloaded_path)
890
+ args.input_file = downloaded_files
891
+ elif isinstance(args.input_file, str):
892
+ # Handle pipe-separated string (for direct mode compatibility)
893
+ if "|" in args.input_file:
894
+ file_list = [f.strip() for f in args.input_file.split("|") if f.strip()]
895
+ downloaded_files = []
896
+ for file_path in file_list:
897
+ downloaded_path = _download_s3_file_if_needed(
898
+ file_path,
899
+ aws_access_key=aws_access_key,
900
+ aws_secret_key=aws_secret_key,
901
+ aws_region=aws_region,
902
+ )
903
+ downloaded_files.append(downloaded_path)
904
+ args.input_file = downloaded_files
905
+ else:
906
+ # Single file path
907
+ args.input_file = [
908
+ _download_s3_file_if_needed(
909
+ args.input_file,
910
+ aws_access_key=aws_access_key,
911
+ aws_secret_key=aws_secret_key,
912
+ aws_region=aws_region,
913
+ )
914
+ ]
915
+
916
+ # Download candidate topics file from S3 if needed
917
+ if args.candidate_topics:
918
+ args.candidate_topics = _download_s3_file_if_needed(
919
+ args.candidate_topics,
920
+ default_filename="downloaded_candidate_topics",
921
+ aws_access_key=aws_access_key,
922
+ aws_secret_key=aws_secret_key,
923
+ aws_region=aws_region,
924
+ )
925
+
926
+ # --- Override model_choice with inference_server_model if provided ---
927
+ # If inference_server_model is explicitly provided, use it to override model_choice
928
+ # This allows users to specify which inference-server model to use
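+ # e.g. passing --inference_server_model "my-served-model" (placeholder name) makes that model the effective model_choice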
929
+ if args.inference_server_model:
930
+ # Check if the current model_choice is an inference-server model
931
+ model_source = model_name_map.get(args.model_choice, {}).get(
932
+ "source", default_model_source
933
+ )
934
+ # If model_source is "inference-server" OR if inference_server_model is explicitly provided
935
+ # (different from default), use it
936
+ if (
937
+ model_source == "inference-server"
938
+ or args.inference_server_model != CHOSEN_INFERENCE_SERVER_MODEL
939
+ ):
940
+ args.model_choice = args.inference_server_model
941
+ # Ensure the model is registered in model_name_map with inference-server source
942
+ if args.model_choice not in model_name_map:
943
+ model_name_map[args.model_choice] = {
944
+ "short_name": args.model_choice,
945
+ "source": "inference-server",
946
+ }
947
+ # Also update the model_source to ensure it's set correctly
948
+ model_name_map[args.model_choice]["source"] = "inference-server"
949
+
950
+ # --- Initial Setup ---
951
+ # Convert string boolean variables to boolean
952
+ args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders)
953
+ args.save_logs_to_csv = convert_string_to_boolean(str(args.save_logs_to_csv))
954
+ args.save_logs_to_dynamodb = convert_string_to_boolean(
955
+ str(args.save_logs_to_dynamodb)
956
+ )
957
+ args.sample_reference_table = convert_string_to_boolean(str(args.sample_reference_table))  # robust to booleans passed via direct mode
958
+ args.output_debug_files = convert_string_to_boolean(str(args.output_debug_files))
959
+
960
+ # Get username and folders
961
+ (
962
+ session_hash,
963
+ args.output_dir,
964
+ _,
965
+ args.input_dir,
966
+ ) = get_username_and_folders(
967
+ username=args.username,
968
+ output_folder_textbox=args.output_dir,
969
+ input_folder_textbox=args.input_dir,
970
+ session_output_folder=args.save_to_user_folders,
971
+ )
972
+
973
+ print(
974
+ f"Conducting analyses with user {args.username or session_hash}. Outputs will be saved to {args.output_dir}."
975
+ )
976
+
977
+ # --- Route to the Correct Workflow Based on Task ---
978
+
979
+ # Validate input_file requirement for tasks that need it
980
+ if args.task in ["extract", "validate", "all_in_one"] and not args.input_file:
981
+ print(f"Error: --input_file is required for '{args.task}' task.")
982
+ return
983
+
984
+ if (
985
+ args.task in ["validate", "deduplicate", "summarise", "overall_summary"]
986
+ and not args.previous_output_files
987
+ ):
988
+ print(f"Error: --previous_output_files is required for '{args.task}' task.")
989
+ return
990
+
991
+ if args.task in ["extract", "validate", "all_in_one"] and not args.text_column:
992
+ print(f"Error: --text_column is required for '{args.task}' task.")
993
+ return
994
+
995
+ start_time = time.time()
996
+
997
+ try:
998
+ # Task 1: Extract Topics
999
+ if args.task == "extract":
1000
+ print("--- Starting Topic Extraction Workflow... ---")
1001
+
1002
+ # Load data file
1003
+ if isinstance(args.input_file, str):
1004
+ args.input_file = [args.input_file]
1005
+
1006
+ file_data, file_name, total_number_of_batches = load_in_data_file(
1007
+ file_paths=args.input_file,
1008
+ in_colnames=[args.text_column],
1009
+ batch_size=args.batch_size,
1010
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1011
+ )
1012
+
1013
+ # Prepare candidate topics if provided
1014
+ candidate_topics = None
1015
+ if args.candidate_topics:
1016
+ candidate_topics = args.candidate_topics
1017
+
1018
+ # Determine summary format prompt
1019
+ summary_format_prompt = (
1020
+ two_para_summary_format_prompt
1021
+ if args.summary_format == "two_paragraph"
1022
+ else single_para_summary_format_prompt
1023
+ )
1024
+
1025
+ # Run extraction
1026
+ (
1027
+ display_markdown,
1028
+ master_topic_df_state,
1029
+ master_unique_topics_df_state,
1030
+ master_reference_df_state,
1031
+ topic_extraction_output_files,
1032
+ text_output_file_list_state,
1033
+ latest_batch_completed,
1034
+ log_files_output,
1035
+ log_files_output_list_state,
1036
+ conversation_metadata_textbox,
1037
+ estimated_time_taken_number,
1038
+ deduplication_input_files,
1039
+ summarisation_input_files,
1040
+ modifiable_unique_topics_df_state,
1041
+ modification_input_files,
1042
+ in_join_files,
1043
+ missing_df_state,
1044
+ input_tokens_num,
1045
+ output_tokens_num,
1046
+ number_of_calls_num,
1047
+ output_messages_textbox,
1048
+ logged_content_df,
1049
+ ) = wrapper_extract_topics_per_column_value(
1050
+ grouping_col=args.group_by,
1051
+ in_data_file=args.input_file,
1052
+ file_data=file_data,
1053
+ initial_existing_topics_table=pd.DataFrame(),
1054
+ initial_existing_reference_df=pd.DataFrame(),
1055
+ initial_existing_topic_summary_df=pd.DataFrame(),
1056
+ initial_unique_table_df_display_table_markdown="",
1057
+ original_file_name=file_name,
1058
+ total_number_of_batches=total_number_of_batches,
1059
+ in_api_key=args.google_api_key,
1060
+ temperature=args.temperature,
1061
+ chosen_cols=[args.text_column],
1062
+ model_choice=args.model_choice,
1063
+ candidate_topics=candidate_topics,
1064
+ initial_first_loop_state=True,
1065
+ initial_all_metadata_content_str="",
1066
+ initial_latest_batch_completed=0,
1067
+ initial_time_taken=0,
1068
+ batch_size=args.batch_size,
1069
+ context_textbox=args.context,
1070
+ sentiment_checkbox=args.sentiment,
1071
+ force_zero_shot_radio=args.force_zero_shot,
1072
+ in_excel_sheets=args.excel_sheets,
1073
+ force_single_topic_radio=args.force_single_topic,
1074
+ produce_structured_summary_radio=args.produce_structured_summary,
1075
+ aws_access_key_textbox=args.aws_access_key,
1076
+ aws_secret_key_textbox=args.aws_secret_key,
1077
+ aws_region_textbox=args.aws_region,
1078
+ hf_api_key_textbox=args.hf_token,
1079
+ azure_api_key_textbox=args.azure_api_key,
1080
+ azure_endpoint_textbox=args.azure_endpoint,
1081
+ output_folder=args.output_dir,
1082
+ existing_logged_content=list(),
1083
+ additional_instructions_summary_format=args.additional_summary_instructions,
1084
+ additional_validation_issues_provided="",
1085
+ show_previous_table="Yes",
1086
+ api_url=args.api_url if args.api_url else API_URL,
1087
+ max_tokens=args.max_tokens,
1088
+ model_name_map=model_name_map,
1089
+ max_time_for_loop=99999,
1090
+ reasoning_suffix="",
1091
+ CHOSEN_LOCAL_MODEL_TYPE="",
1092
+ output_debug_files=str(args.output_debug_files),
1093
+ model=None,
1094
+ tokenizer=None,
1095
+ assistant_model=None,
1096
+ max_rows=999999,
1097
+ )
1098
+
1099
+ end_time = time.time()
1100
+ processing_time = end_time - start_time
1101
+
1102
+ print("\n--- Topic Extraction Complete ---")
1103
+ print(f"Processing time: {processing_time:.2f} seconds")
1104
+ print(f"\nOutput files saved to: {args.output_dir}")
1105
+ if topic_extraction_output_files:
1106
+ print("Generated Files:", sorted(topic_extraction_output_files))
1107
+
1108
+ # Write usage log (before Excel creation so it can be included in Excel)
1109
+ write_usage_log(
1110
+ session_hash=session_hash,
1111
+ file_name=file_name,
1112
+ text_column=args.text_column,
1113
+ model_choice=args.model_choice,
1114
+ conversation_metadata=conversation_metadata_textbox or "",
1115
+ input_tokens=input_tokens_num or 0,
1116
+ output_tokens=output_tokens_num or 0,
1117
+ number_of_calls=number_of_calls_num or 0,
1118
+ estimated_time_taken=estimated_time_taken_number or processing_time,
1119
+ cost_code=args.cost_code,
1120
+ save_to_csv=args.save_logs_to_csv,
1121
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1122
+ )
1123
+
1124
+ # Create Excel output if requested
1125
+ xlsx_files = []
1126
+ if args.create_xlsx_output:
1127
+ print("\nCreating Excel output file...")
1128
+ try:
1129
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1130
+ in_data_files=args.input_file,
1131
+ chosen_cols=[args.text_column],
1132
+ reference_data_file_name_textbox=file_name,
1133
+ in_group_col=args.group_by,
1134
+ model_choice=args.model_choice,
1135
+ master_reference_df_state=master_reference_df_state,
1136
+ master_unique_topics_df_state=master_unique_topics_df_state,
1137
+ summarised_output_df=pd.DataFrame(), # No summaries yet
1138
+ missing_df_state=missing_df_state,
1139
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1140
+ usage_logs_location=(
1141
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1142
+ if args.save_logs_to_csv
1143
+ else ""
1144
+ ),
1145
+ model_name_map=model_name_map,
1146
+ output_folder=args.output_dir,
1147
+ structured_summaries=args.produce_structured_summary,
1148
+ )
1149
+ if xlsx_files:
1150
+ print(f"Excel output created: {sorted(xlsx_files)}")
1151
+ except Exception as e:
1152
+ print(f"Warning: Could not create Excel output: {e}")
1153
+
1154
+ # Upload outputs to S3 if enabled
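+ # (By construction this is a no-op when S3 saving is not enabled; the destination bucket is assumed to come from config)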
1155
+ all_output_files = (
1156
+ list(topic_extraction_output_files)
1157
+ if topic_extraction_output_files
1158
+ else []
1159
+ )
1160
+ if xlsx_files:
1161
+ all_output_files.extend(xlsx_files)
1162
+ upload_outputs_to_s3_if_enabled(
1163
+ output_files=all_output_files,
1164
+ base_file_name=file_name,
1165
+ session_hash=session_hash,
1166
+ )
1167
+
1168
+ # Task 2: Validate Topics
1169
+ elif args.task == "validate":
1170
+ print("--- Starting Topic Validation Workflow... ---")
1171
+
1172
+ # Load data file
1173
+ if isinstance(args.input_file, str):
1174
+ args.input_file = [args.input_file]
1175
+
1176
+ file_data, file_name, total_number_of_batches = load_in_data_file(
1177
+ file_paths=args.input_file,
1178
+ in_colnames=[args.text_column],
1179
+ batch_size=args.batch_size,
1180
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1181
+ )
1182
+
1183
+ # Load previous output files
1184
+ (
1185
+ reference_df,
1186
+ topic_summary_df,
1187
+ latest_batch_completed_no_loop,
1188
+ deduplication_input_files_status,
1189
+ working_data_file_name_textbox,
1190
+ unique_topics_table_file_name_textbox,
1191
+ ) = load_in_previous_data_files(args.previous_output_files)
1192
+
1193
+ # Run validation
1194
+ (
1195
+ display_markdown,
1196
+ master_topic_df_state,
1197
+ master_unique_topics_df_state,
1198
+ master_reference_df_state,
1199
+ validation_output_files,
1200
+ text_output_file_list_state,
1201
+ latest_batch_completed,
1202
+ log_files_output,
1203
+ log_files_output_list_state,
1204
+ conversation_metadata_textbox,
1205
+ estimated_time_taken_number,
1206
+ deduplication_input_files,
1207
+ summarisation_input_files,
1208
+ modifiable_unique_topics_df_state,
1209
+ modification_input_files,
1210
+ in_join_files,
1211
+ missing_df_state,
1212
+ input_tokens_num,
1213
+ output_tokens_num,
1214
+ number_of_calls_num,
1215
+ output_messages_textbox,
1216
+ logged_content_df,
1217
+ ) = validate_topics_wrapper(
1218
+ file_data=file_data,
1219
+ reference_df=reference_df,
1220
+ topic_summary_df=topic_summary_df,
1221
+ file_name=working_data_file_name_textbox,
1222
+ chosen_cols=[args.text_column],
1223
+ batch_size=args.batch_size,
1224
+ model_choice=args.model_choice,
1225
+ in_api_key=args.google_api_key,
1226
+ temperature=args.temperature,
1227
+ max_tokens=args.max_tokens,
1228
+ azure_api_key_textbox=args.azure_api_key,
1229
+ azure_endpoint_textbox=args.azure_endpoint,
1230
+ reasoning_suffix="",
1231
+ group_name=args.group_by or "All",
1232
+ produce_structured_summary_radio=args.produce_structured_summary,
1233
+ force_zero_shot_radio=args.force_zero_shot,
1234
+ force_single_topic_radio=args.force_single_topic,
1235
+ context_textbox=args.context,
1236
+ additional_instructions_summary_format=args.additional_summary_instructions,
1237
+ output_folder=args.output_dir,
1238
+ output_debug_files=str(args.output_debug_files),
1239
+ original_full_file_name=file_name,
1240
+ additional_validation_issues_provided=args.additional_validation_issues,
1241
+ max_time_for_loop=args.max_time_for_loop,
1242
+ in_data_files=args.input_file,
1243
+ sentiment_checkbox=args.sentiment,
1244
+ logged_content=None,
1245
+ show_previous_table=args.show_previous_table,
1246
+ aws_access_key_textbox=args.aws_access_key,
1247
+ aws_secret_key_textbox=args.aws_secret_key,
1248
+ aws_region_textbox=args.aws_region,
1249
+ api_url=args.api_url if args.api_url else API_URL,
1250
+ )
1251
+
1252
+ end_time = time.time()
1253
+ processing_time = end_time - start_time
1254
+
1255
+ print("\n--- Topic Validation Complete ---")
1256
+ print(f"Processing time: {processing_time:.2f} seconds")
1257
+ print(f"\nOutput files saved to: {args.output_dir}")
1258
+ if validation_output_files:
1259
+ print("Generated Files:", sorted(validation_output_files))
1260
+
1261
+ # Write usage log
1262
+ write_usage_log(
1263
+ session_hash=session_hash,
1264
+ file_name=file_name,
1265
+ text_column=args.text_column,
1266
+ model_choice=args.model_choice,
1267
+ conversation_metadata=conversation_metadata_textbox or "",
1268
+ input_tokens=input_tokens_num or 0,
1269
+ output_tokens=output_tokens_num or 0,
1270
+ number_of_calls=number_of_calls_num or 0,
1271
+ estimated_time_taken=estimated_time_taken_number or processing_time,
1272
+ cost_code=args.cost_code,
1273
+ save_to_csv=args.save_logs_to_csv,
1274
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1275
+ )
1276
+
1277
+ # Create Excel output if requested
1278
+ if args.create_xlsx_output:
1279
+ print("\nCreating Excel output file...")
1280
+ try:
1281
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1282
+ in_data_files=args.input_file,
1283
+ chosen_cols=[args.text_column],
1284
+ reference_data_file_name_textbox=file_name,
1285
+ in_group_col=args.group_by,
1286
+ model_choice=args.model_choice,
1287
+ master_reference_df_state=master_reference_df_state,
1288
+ master_unique_topics_df_state=master_unique_topics_df_state,
1289
+ summarised_output_df=pd.DataFrame(), # No summaries yet
1290
+ missing_df_state=missing_df_state,
1291
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1292
+ usage_logs_location=(
1293
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1294
+ if args.save_logs_to_csv
1295
+ else ""
1296
+ ),
1297
+ model_name_map=model_name_map,
1298
+ output_folder=args.output_dir,
1299
+ structured_summaries=args.produce_structured_summary,
1300
+ )
1301
+ if xlsx_files:
1302
+ print(f"Excel output created: {sorted(xlsx_files)}")
1303
+ except Exception as e:
1304
+ print(f"Warning: Could not create Excel output: {e}")
1305
+
1306
+ # Task 3: Deduplicate Topics
1307
+ elif args.task == "deduplicate":
1308
+ print("--- Starting Topic Deduplication Workflow... ---")
1309
+
1310
+ # Load previous output files
1311
+ (
1312
+ reference_df,
1313
+ topic_summary_df,
1314
+ latest_batch_completed_no_loop,
1315
+ deduplication_input_files_status,
1316
+ working_data_file_name_textbox,
1317
+ unique_topics_table_file_name_textbox,
1318
+ ) = load_in_previous_data_files(args.previous_output_files)
1319
+
1320
+ if args.method == "fuzzy":
1321
+ # Fuzzy matching deduplication
1322
+ (
1323
+ ref_df_after_dedup,
1324
+ unique_df_after_dedup,
1325
+ summarisation_input_files,
1326
+ log_files_output,
1327
+ summarised_output_markdown,
1328
+ ) = deduplicate_topics(
1329
+ reference_df=reference_df,
1330
+ topic_summary_df=topic_summary_df,
1331
+ reference_table_file_name=working_data_file_name_textbox,
1332
+ unique_topics_table_file_name=unique_topics_table_file_name_textbox,
1333
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1334
+ merge_sentiment=args.merge_sentiment,
1335
+ merge_general_topics=args.merge_general_topics,
1336
+ score_threshold=args.similarity_threshold,
1337
+ in_data_files=args.input_file if args.input_file else list(),
1338
+ chosen_cols=[args.text_column] if args.text_column else list(),
1339
+ output_folder=args.output_dir,
1340
+ )
1341
+ else:
1342
+ # LLM deduplication
1343
+ model_source = model_name_map.get(args.model_choice, {}).get(
1344
+ "source", default_model_source
1345
+ )
1346
+ (
1347
+ ref_df_after_dedup,
1348
+ unique_df_after_dedup,
1349
+ summarisation_input_files,
1350
+ log_files_output,
1351
+ summarised_output_markdown,
1352
+ input_tokens_num,
1353
+ output_tokens_num,
1354
+ number_of_calls_num,
1355
+ estimated_time_taken_number,
1356
+ ) = deduplicate_topics_llm(
1357
+ reference_df=reference_df,
1358
+ topic_summary_df=topic_summary_df,
1359
+ reference_table_file_name=working_data_file_name_textbox,
1360
+ unique_topics_table_file_name=unique_topics_table_file_name_textbox,
1361
+ model_choice=args.model_choice,
1362
+ in_api_key=args.google_api_key,
1363
+ temperature=args.temperature,
1364
+ model_source=model_source,
1365
+ bedrock_runtime=None,
1366
+ local_model=None,
1367
+ tokenizer=None,
1368
+ assistant_model=None,
1369
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1370
+ merge_sentiment=args.merge_sentiment,
1371
+ merge_general_topics=args.merge_general_topics,
1372
+ in_data_files=args.input_file if args.input_file else list(),
1373
+ chosen_cols=[args.text_column] if args.text_column else list(),
1374
+ output_folder=args.output_dir,
1375
+ candidate_topics=(
1376
+ args.candidate_topics if args.candidate_topics else None
1377
+ ),
1378
+ azure_endpoint=args.azure_endpoint,
1379
+ output_debug_files=str(args.output_debug_files),
1380
+ api_url=args.api_url if args.api_url else API_URL,
1381
+ )
1382
+
1383
+ end_time = time.time()
1384
+ processing_time = end_time - start_time
1385
+
1386
+ print("\n--- Topic Deduplication Complete ---")
1387
+ print(f"Processing time: {processing_time:.2f} seconds")
1388
+ print(f"\nOutput files saved to: {args.output_dir}")
1389
+ if summarisation_input_files:
1390
+ print("Generated Files:", sorted(summarisation_input_files))
1391
+
1392
+ # Write usage log (only for LLM deduplication which has token counts)
1393
+ if args.method == "llm":
1394
+ # Extract token counts from LLM deduplication result
1395
+ llm_input_tokens = (
1396
+ input_tokens_num if "input_tokens_num" in locals() else 0
1397
+ )
1398
+ llm_output_tokens = (
1399
+ output_tokens_num if "output_tokens_num" in locals() else 0
1400
+ )
1401
+ llm_calls = (
1402
+ number_of_calls_num if "number_of_calls_num" in locals() else 0
1403
+ )
1404
+ llm_time = (
1405
+ estimated_time_taken_number
1406
+ if "estimated_time_taken_number" in locals()
1407
+ else processing_time
1408
+ )
1409
+
1410
+ write_usage_log(
1411
+ session_hash=session_hash,
1412
+ file_name=working_data_file_name_textbox,
1413
+ text_column=args.text_column if args.text_column else "",
1414
+ model_choice=args.model_choice,
1415
+ conversation_metadata="",
1416
+ input_tokens=llm_input_tokens,
1417
+ output_tokens=llm_output_tokens,
1418
+ number_of_calls=llm_calls,
1419
+ estimated_time_taken=llm_time,
1420
+ cost_code=args.cost_code,
1421
+ save_to_csv=args.save_logs_to_csv,
1422
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1423
+ )
1424
+
1425
+ # Create Excel output if requested
1426
+ xlsx_files = []
1427
+ if args.create_xlsx_output:
1428
+ print("\nCreating Excel output file...")
1429
+ try:
1430
+ # Use the deduplicated dataframes
1431
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1432
+ in_data_files=args.input_file if args.input_file else [],
1433
+ chosen_cols=[args.text_column] if args.text_column else [],
1434
+ reference_data_file_name_textbox=working_data_file_name_textbox,
1435
+ in_group_col=args.group_by,
1436
+ model_choice=args.model_choice,
1437
+ master_reference_df_state=ref_df_after_dedup,
1438
+ master_unique_topics_df_state=unique_df_after_dedup,
1439
+ summarised_output_df=pd.DataFrame(), # No summaries yet
1440
+ missing_df_state=pd.DataFrame(),
1441
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1442
+ usage_logs_location=(
1443
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1444
+ if args.save_logs_to_csv
1445
+ else ""
1446
+ ),
1447
+ model_name_map=model_name_map,
1448
+ output_folder=args.output_dir,
1449
+ structured_summaries=args.produce_structured_summary,
1450
+ )
1451
+ if xlsx_files:
1452
+ print(f"Excel output created: {sorted(xlsx_files)}")
1453
+ except Exception as e:
1454
+ print(f"Warning: Could not create Excel output: {e}")
1455
+
1456
+ # Upload outputs to S3 if enabled
1457
+ all_output_files = (
1458
+ list(summarisation_input_files) if summarisation_input_files else []
1459
+ )
1460
+ if xlsx_files:
1461
+ all_output_files.extend(xlsx_files)
1462
+ upload_outputs_to_s3_if_enabled(
1463
+ output_files=all_output_files,
1464
+ base_file_name=working_data_file_name_textbox,
1465
+ session_hash=session_hash,
1466
+ )
1467
+
1468
+ # Task 4: Summarise Topics
1469
+ elif args.task == "summarise":
1470
+ print("--- Starting Topic Summarisation Workflow... ---")
1471
+
1472
+ # Load previous output files
1473
+ (
1474
+ reference_df,
1475
+ topic_summary_df,
1476
+ latest_batch_completed_no_loop,
1477
+ deduplication_input_files_status,
1478
+ working_data_file_name_textbox,
1479
+ unique_topics_table_file_name_textbox,
1480
+ ) = load_in_previous_data_files(args.previous_output_files)
1481
+
1482
+ # Determine summary format prompt
1483
+ summary_format_prompt = (
1484
+ two_para_summary_format_prompt
1485
+ if args.summary_format == "two_paragraph"
1486
+ else single_para_summary_format_prompt
1487
+ )
1488
+
1489
+ # Run summarisation
1490
+ (
1491
+ summary_reference_table_sample_state,
1492
+ master_unique_topics_df_revised_summaries_state,
1493
+ master_reference_df_revised_summaries_state,
1494
+ summary_output_files,
1495
+ summarised_outputs_list,
1496
+ latest_summary_completed_num,
1497
+ conversation_metadata_textbox,
1498
+ summarised_output_markdown,
1499
+ log_files_output,
1500
+ overall_summarisation_input_files,
1501
+ input_tokens_num,
1502
+ output_tokens_num,
1503
+ number_of_calls_num,
1504
+ estimated_time_taken_number,
1505
+ output_messages_textbox,
1506
+ logged_content_df,
1507
+ ) = wrapper_summarise_output_topics_per_group(
1508
+ grouping_col=args.group_by,
1509
+ sampled_reference_table_df=reference_df.copy(), # Will be sampled if sample_reference_table=True
1510
+ topic_summary_df=topic_summary_df,
1511
+ reference_table_df=reference_df,
1512
+ model_choice=args.model_choice,
1513
+ in_api_key=args.google_api_key,
1514
+ temperature=args.temperature,
1515
+ reference_data_file_name=working_data_file_name_textbox,
1516
+ summarised_outputs=list(),
1517
+ latest_summary_completed=0,
1518
+ out_metadata_str="",
1519
+ in_data_files=args.input_file if args.input_file else list(),
1520
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1521
+ chosen_cols=[args.text_column] if args.text_column else list(),
1522
+ log_output_files=list(),
1523
+ summarise_format_radio=summary_format_prompt,
1524
+ output_folder=args.output_dir,
1525
+ context_textbox=args.context,
1526
+ aws_access_key_textbox=args.aws_access_key,
1527
+ aws_secret_key_textbox=args.aws_secret_key,
1528
+ aws_region_textbox=args.aws_region,
1529
+ model_name_map=model_name_map,
1530
+ hf_api_key_textbox=args.hf_token,
1531
+ azure_endpoint_textbox=args.azure_endpoint,
1532
+ existing_logged_content=list(),
1533
+ sample_reference_table=args.sample_reference_table,
1534
+ no_of_sampled_summaries=args.no_of_sampled_summaries,
1535
+ random_seed=args.random_seed,
1536
+ api_url=args.api_url if args.api_url else API_URL,
1537
+ additional_summary_instructions_provided=args.additional_summary_instructions,
1538
+ output_debug_files=str(args.output_debug_files),
1539
+ reasoning_suffix="",
1540
+ local_model=None,
1541
+ tokenizer=None,
1542
+ assistant_model=None,
1543
+ do_summaries="Yes",
1544
+ )
1545
+
1546
+ end_time = time.time()
1547
+ processing_time = end_time - start_time
1548
+
1549
+ print("\n--- Topic Summarisation Complete ---")
1550
+ print(f"Processing time: {processing_time:.2f} seconds")
1551
+ print(f"\nOutput files saved to: {args.output_dir}")
1552
+ if summary_output_files:
1553
+ print("Generated Files:", sorted(summary_output_files))
1554
+
1555
+ # Write usage log
1556
+ write_usage_log(
1557
+ session_hash=session_hash,
1558
+ file_name=working_data_file_name_textbox,
1559
+ text_column=args.text_column if args.text_column else "",
1560
+ model_choice=args.model_choice,
1561
+ conversation_metadata=conversation_metadata_textbox or "",
1562
+ input_tokens=input_tokens_num or 0,
1563
+ output_tokens=output_tokens_num or 0,
1564
+ number_of_calls=number_of_calls_num or 0,
1565
+ estimated_time_taken=estimated_time_taken_number or processing_time,
1566
+ cost_code=args.cost_code,
1567
+ save_to_csv=args.save_logs_to_csv,
1568
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1569
+ )
1570
+
1571
+ # Create Excel output if requested
1572
+ xlsx_files = []
1573
+ if args.create_xlsx_output:
1574
+ print("\nCreating Excel output file...")
1575
+ try:
1576
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1577
+ in_data_files=args.input_file if args.input_file else [],
1578
+ chosen_cols=[args.text_column] if args.text_column else [],
1579
+ reference_data_file_name_textbox=working_data_file_name_textbox,
1580
+ in_group_col=args.group_by,
1581
+ model_choice=args.model_choice,
1582
+ master_reference_df_state=master_reference_df_revised_summaries_state,
1583
+ master_unique_topics_df_state=master_unique_topics_df_revised_summaries_state,
1584
+ summarised_output_df=pd.DataFrame(), # Summaries are in the revised dataframes
1585
+ missing_df_state=pd.DataFrame(),
1586
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1587
+ usage_logs_location=(
1588
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1589
+ if args.save_logs_to_csv
1590
+ else ""
1591
+ ),
1592
+ model_name_map=model_name_map,
1593
+ output_folder=args.output_dir,
1594
+ structured_summaries=args.produce_structured_summary,
1595
+ )
1596
+ if xlsx_files:
1597
+ print(f"Excel output created: {sorted(xlsx_files)}")
1598
+ except Exception as e:
1599
+ print(f"Warning: Could not create Excel output: {e}")
1600
+
1601
+ # Upload outputs to S3 if enabled
1602
+ all_output_files = (
1603
+ list(summary_output_files) if summary_output_files else []
1604
+ )
1605
+ if xlsx_files:
1606
+ all_output_files.extend(xlsx_files)
1607
+ upload_outputs_to_s3_if_enabled(
1608
+ output_files=all_output_files,
1609
+ base_file_name=working_data_file_name_textbox,
1610
+ session_hash=session_hash,
1611
+ )
1612
+
1613
+ # Task 5: Overall Summary
1614
+ elif args.task == "overall_summary":
1615
+ print("--- Starting Overall Summary Workflow... ---")
1616
+
1617
+ # Load previous output files
1618
+ (
1619
+ reference_df,
1620
+ topic_summary_df,
1621
+ latest_batch_completed_no_loop,
1622
+ deduplication_input_files_status,
1623
+ working_data_file_name_textbox,
1624
+ unique_topics_table_file_name_textbox,
1625
+ ) = load_in_previous_data_files(args.previous_output_files)
1626
+
1627
+ # Run overall summary
1628
+ (
1629
+ overall_summary_output_files,
1630
+ overall_summarised_output_markdown,
1631
+ summarised_output_df,
1632
+ conversation_metadata_textbox,
1633
+ input_tokens_num,
1634
+ output_tokens_num,
1635
+ number_of_calls_num,
1636
+ estimated_time_taken_number,
1637
+ output_messages_textbox,
1638
+ logged_content_df,
1639
+ ) = overall_summary(
1640
+ topic_summary_df=topic_summary_df,
1641
+ model_choice=args.model_choice,
1642
+ in_api_key=args.google_api_key,
1643
+ temperature=args.temperature,
1644
+ reference_data_file_name=working_data_file_name_textbox,
1645
+ output_folder=args.output_dir,
1646
+ chosen_cols=[args.text_column] if args.text_column else list(),
1647
+ context_textbox=args.context,
1648
+ aws_access_key_textbox=args.aws_access_key,
1649
+ aws_secret_key_textbox=args.aws_secret_key,
1650
+ aws_region_textbox=args.aws_region,
1651
+ model_name_map=model_name_map,
1652
+ hf_api_key_textbox=args.hf_token,
1653
+ azure_endpoint_textbox=args.azure_endpoint,
1654
+ existing_logged_content=list(),
1655
+ api_url=args.api_url if args.api_url else API_URL,
1656
+ output_debug_files=str(args.output_debug_files),
1657
+ log_output_files=list(),
1658
+ reasoning_suffix="",
1659
+ local_model=None,
1660
+ tokenizer=None,
1661
+ assistant_model=None,
1662
+ do_summaries="Yes",
1663
+ )
1664
+
1665
+ end_time = time.time()
1666
+ processing_time = end_time - start_time
1667
+
1668
+ print("\n--- Overall Summary Complete ---")
1669
+ print(f"Processing time: {processing_time:.2f} seconds")
1670
+ print(f"\nOutput files saved to: {args.output_dir}")
1671
+ if overall_summary_output_files:
1672
+ print("Generated Files:", sorted(overall_summary_output_files))
1673
+
1674
+ # Write usage log
1675
+ write_usage_log(
1676
+ session_hash=session_hash,
1677
+ file_name=working_data_file_name_textbox,
1678
+ text_column=args.text_column if args.text_column else "",
1679
+ model_choice=args.model_choice,
1680
+ conversation_metadata=conversation_metadata_textbox or "",
1681
+ input_tokens=input_tokens_num or 0,
1682
+ output_tokens=output_tokens_num or 0,
1683
+ number_of_calls=number_of_calls_num or 0,
1684
+ estimated_time_taken=estimated_time_taken_number or processing_time,
1685
+ cost_code=args.cost_code,
1686
+ save_to_csv=args.save_logs_to_csv,
1687
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1688
+ )
1689
+
1690
+ # Create Excel output if requested
1691
+ xlsx_files = []
1692
+ if args.create_xlsx_output:
1693
+ print("\nCreating Excel output file...")
1694
+ try:
1695
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1696
+ in_data_files=args.input_file if args.input_file else [],
1697
+ chosen_cols=[args.text_column] if args.text_column else [],
1698
+ reference_data_file_name_textbox=working_data_file_name_textbox,
1699
+ in_group_col=args.group_by,
1700
+ model_choice=args.model_choice,
1701
+ master_reference_df_state=reference_df, # Use original reference_df
1702
+ master_unique_topics_df_state=topic_summary_df, # Use original topic_summary_df
1703
+ summarised_output_df=summarised_output_df,
1704
+ missing_df_state=pd.DataFrame(),
1705
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1706
+ usage_logs_location=(
1707
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1708
+ if args.save_logs_to_csv
1709
+ else ""
1710
+ ),
1711
+ model_name_map=model_name_map,
1712
+ output_folder=args.output_dir,
1713
+ structured_summaries=args.produce_structured_summary,
1714
+ )
1715
+ if xlsx_files:
1716
+ print(f"Excel output created: {sorted(xlsx_files)}")
1717
+ except Exception as e:
1718
+ print(f"Warning: Could not create Excel output: {e}")
1719
+
1720
+ # Upload outputs to S3 if enabled
1721
+ all_output_files = (
1722
+ list(overall_summary_output_files)
1723
+ if overall_summary_output_files
1724
+ else []
1725
+ )
1726
+ if xlsx_files:
1727
+ all_output_files.extend(xlsx_files)
1728
+ upload_outputs_to_s3_if_enabled(
1729
+ output_files=all_output_files,
1730
+ base_file_name=working_data_file_name_textbox,
1731
+ session_hash=session_hash,
1732
+ )
1733
+
1734
+ # Task 6: All-in-One Pipeline
1735
+ elif args.task == "all_in_one":
1736
+ print("--- Starting All-in-One Pipeline Workflow... ---")
1737
+
1738
+ # Load data file
1739
+ if isinstance(args.input_file, str):
1740
+ args.input_file = [args.input_file]
1741
+
1742
+ file_data, file_name, total_number_of_batches = load_in_data_file(
1743
+ file_paths=args.input_file,
1744
+ in_colnames=[args.text_column],
1745
+ batch_size=args.batch_size,
1746
+ in_excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1747
+ )
1748
+
1749
+ # Prepare candidate topics if provided
1750
+ candidate_topics = None
1751
+ if args.candidate_topics:
1752
+ candidate_topics = args.candidate_topics
1753
+
1754
+ # Determine summary format prompt
1755
+ summary_format_prompt = (
1756
+ two_para_summary_format_prompt
1757
+ if args.summary_format == "two_paragraph"
1758
+ else single_para_summary_format_prompt
1759
+ )
1760
+
1761
+ # Run all-in-one pipeline
1762
+ (
1763
+ display_markdown,
1764
+ master_topic_df_state,
1765
+ master_unique_topics_df_state,
1766
+ master_reference_df_state,
1767
+ topic_extraction_output_files,
1768
+ text_output_file_list_state,
1769
+ latest_batch_completed,
1770
+ log_files_output,
1771
+ log_files_output_list_state,
1772
+ conversation_metadata_textbox,
1773
+ estimated_time_taken_number,
1774
+ deduplication_input_files,
1775
+ summarisation_input_files,
1776
+ modifiable_unique_topics_df_state,
1777
+ modification_input_files,
1778
+ in_join_files,
1779
+ missing_df_state,
1780
+ input_tokens_num,
1781
+ output_tokens_num,
1782
+ number_of_calls_num,
1783
+ output_messages_textbox,
1784
+ summary_reference_table_sample_state,
1785
+ summarised_references_markdown,
1786
+ master_unique_topics_df_revised_summaries_state,
1787
+ master_reference_df_revised_summaries_state,
1788
+ summary_output_files,
1789
+ summarised_outputs_list,
1790
+ latest_summary_completed_num,
1791
+ overall_summarisation_input_files,
1792
+ overall_summary_output_files,
1793
+ overall_summarised_output_markdown,
1794
+ summarised_output_df,
1795
+ logged_content_df,
1796
+ ) = all_in_one_pipeline(
1797
+ grouping_col=args.group_by,
1798
+ in_data_files=args.input_file,
1799
+ file_data=file_data,
1800
+ existing_topics_table=pd.DataFrame(),
1801
+ existing_reference_df=pd.DataFrame(),
1802
+ existing_topic_summary_df=pd.DataFrame(),
1803
+ unique_table_df_display_table_markdown="",
1804
+ original_file_name=file_name,
1805
+ total_number_of_batches=total_number_of_batches,
1806
+ in_api_key=args.google_api_key,
1807
+ temperature=args.temperature,
1808
+ chosen_cols=[args.text_column],
1809
+ model_choice=args.model_choice,
1810
+ candidate_topics=candidate_topics,
1811
+ first_loop_state=True,
1812
+ conversation_metadata_text="",
1813
+ latest_batch_completed=0,
1814
+ time_taken_so_far=0,
1815
+ initial_table_prompt_text=initial_table_prompt,
1816
+ initial_table_system_prompt_text=initial_table_system_prompt,
1817
+ add_existing_topics_system_prompt_text=add_existing_topics_system_prompt,
1818
+ add_existing_topics_prompt_text=add_existing_topics_prompt,
1819
+ number_of_prompts_used=1,
1820
+ batch_size=args.batch_size,
1821
+ context_text=args.context,
1822
+ sentiment_choice=args.sentiment,
1823
+ force_zero_shot_choice=args.force_zero_shot,
1824
+ in_excel_sheets=args.excel_sheets,
1825
+ force_single_topic_choice=args.force_single_topic,
1826
+ produce_structures_summary_choice=args.produce_structured_summary,
1827
+ aws_access_key_text=args.aws_access_key,
1828
+ aws_secret_key_text=args.aws_secret_key,
1829
+ aws_region_text=args.aws_region,
1830
+ hf_api_key_text=args.hf_token,
1831
+ azure_api_key_text=args.azure_api_key,
1832
+ azure_endpoint_text=args.azure_endpoint,
1833
+ output_folder=args.output_dir,
1834
+ merge_sentiment=args.merge_sentiment,
1835
+ merge_general_topics=args.merge_general_topics,
1836
+ score_threshold=args.similarity_threshold,
1837
+ summarise_format=summary_format_prompt,
1838
+ random_seed=args.random_seed,
1839
+ log_files_output_list_state=list(),
1840
+ model_name_map_state=model_name_map,
1841
+ usage_logs_location=(
1842
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1843
+ if args.save_logs_to_csv
1844
+ else ""
1845
+ ),
1846
+ existing_logged_content=list(),
1847
+ additional_instructions_summary_format=args.additional_summary_instructions,
1848
+ additional_validation_issues_provided="",
1849
+ show_previous_table="Yes",
1850
+ sample_reference_table_checkbox=args.sample_reference_table,
1851
+ api_url=args.api_url if args.api_url else API_URL,
1852
+ output_debug_files=str(args.output_debug_files),
1853
+ model=None,
1854
+ tokenizer=None,
1855
+ assistant_model=None,
1856
+ max_rows=999999,
1857
+ )
1858
+
1859
+ end_time = time.time()
1860
+ processing_time = end_time - start_time
1861
+
1862
+ print("\n--- All-in-One Pipeline Complete ---")
1863
+ print(f"Processing time: {processing_time:.2f} seconds")
1864
+ print(f"\nOutput files saved to: {args.output_dir}")
1865
+ if overall_summary_output_files:
1866
+ print("Generated Files:", sorted(overall_summary_output_files))
1867
+
1868
+ # Write usage log
1869
+ write_usage_log(
1870
+ session_hash=session_hash,
1871
+ file_name=file_name,
1872
+ text_column=args.text_column,
1873
+ model_choice=args.model_choice,
1874
+ conversation_metadata=conversation_metadata_textbox or "",
1875
+ input_tokens=input_tokens_num or 0,
1876
+ output_tokens=output_tokens_num or 0,
1877
+ number_of_calls=number_of_calls_num or 0,
1878
+ estimated_time_taken=estimated_time_taken_number or processing_time,
1879
+ cost_code=args.cost_code,
1880
+ save_to_csv=args.save_logs_to_csv,
1881
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1882
+ )
1883
+
1884
+ # Create Excel output if requested
1885
+ xlsx_files = []
1886
+ if args.create_xlsx_output:
1887
+ print("\nCreating Excel output file...")
1888
+ try:
1889
+ xlsx_files, _ = collect_output_csvs_and_create_excel_output(
1890
+ in_data_files=args.input_file,
1891
+ chosen_cols=[args.text_column],
1892
+ reference_data_file_name_textbox=file_name,
1893
+ in_group_col=args.group_by,
1894
+ model_choice=args.model_choice,
1895
+ master_reference_df_state=master_reference_df_revised_summaries_state,
1896
+ master_unique_topics_df_state=master_unique_topics_df_revised_summaries_state,
1897
+ summarised_output_df=summarised_output_df,
1898
+ missing_df_state=missing_df_state,
1899
+ excel_sheets=args.excel_sheets[0] if args.excel_sheets else "",
1900
+ usage_logs_location=(
1901
+ os.path.join(USAGE_LOGS_FOLDER, USAGE_LOG_FILE_NAME)
1902
+ if args.save_logs_to_csv
1903
+ else ""
1904
+ ),
1905
+ model_name_map=model_name_map,
1906
+ output_folder=args.output_dir,
1907
+ structured_summaries=args.produce_structured_summary,
1908
+ )
1909
+ if xlsx_files:
1910
+ print(f"Excel output created: {sorted(xlsx_files)}")
1911
+ except Exception as e:
1912
+ print(f"Warning: Could not create Excel output: {e}")
1913
+
1914
+ # Upload outputs to S3 if enabled
1915
+ # Collect all output files from the pipeline
1916
+ all_output_files = []
1917
+ if topic_extraction_output_files:
1918
+ all_output_files.extend(topic_extraction_output_files)
1919
+ if overall_summary_output_files:
1920
+ all_output_files.extend(overall_summary_output_files)
1921
+ if xlsx_files:
1922
+ all_output_files.extend(xlsx_files)
1923
+ upload_outputs_to_s3_if_enabled(
1924
+ output_files=all_output_files,
1925
+ base_file_name=file_name,
1926
+ session_hash=session_hash,
1927
+ )
1928
+
1929
+ else:
1930
+ print(f"Error: Invalid task '{args.task}'.")
1931
+ print(
1932
+ "Valid options: 'extract', 'validate', 'deduplicate', 'summarise', 'overall_summary', or 'all_in_one'"
1933
+ )
1934
+
1935
+ except Exception as e:
1936
+ print(f"\nAn error occurred during the workflow: {e}")
1937
+ import traceback
1938
+
1939
+ traceback.print_exc()
1940
+
1941
+
1942
+ if __name__ == "__main__":
1943
+ main()
entrypoint.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ # Exit immediately if a command exits with a non-zero status.
4
+ set -e
5
+
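+ # APP_MODE is expected to be supplied by the container environment, e.g. (placeholder image and handler names):
+ #   docker run -e APP_MODE=lambda my-image lambda_entrypoint.handler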
6
+ echo "Starting in APP_MODE: $APP_MODE"
7
+
8
+ # --- Start the app based on mode ---
9
+
10
+ if [ "$APP_MODE" = "lambda" ]; then
11
+ echo "Starting in Lambda mode..."
12
+ # The CMD from Dockerfile will be passed as "$@"
13
+ exec python -m awslambdaric "$@"
14
+ else
15
+ echo "Starting in Gradio mode..."
16
+ exec python app.py
17
+ fi
18
+
example_data/case_note_headers_specific.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ General Topic,Subtopic
2
+ Mental health,Anger
3
+ Mental health,Social issues
4
+ Physical health,General
5
+ Physical health,Substance misuse
6
+ Behaviour at school,Behaviour at school
7
+ Trends over time,Trends over time
example_data/combined_case_notes.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,Social Worker,Client,Case Note
2
+ "January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
3
+ "January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
4
+ "February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
5
+ "February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
6
+ "March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
7
+ "March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
8
+ "April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
9
+ "April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
10
+ "May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
11
+ "January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
12
+ "January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
13
+ "February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
14
+ "February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
15
+ "March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
16
+ "March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
17
+ "April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
18
+ "April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
19
+ "May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322a081b29d4fb40ccae7d47aa74fda772a002eda576ddc98d6acc86366cff11
3
+ size 13502
example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3dcc1ea155169c23d913043b1ad87da2f2912be36d9fb1521c72ee05b8dcf36
3
+ size 25299
example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e1eaede9af75b6ab695b1cfc6c01ec875abf14521249ba7257bd4bb0afd7ee8
3
+ size 28673
example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30947f1355eacc74c92d09b766e8e3d71092b9a240e7f8acd381874b7d7ebcb3
3
+ size 24673
example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5f0e36143d8362391e3b11d1c20e3a2a1b7536b8f0c972e3d44644eb9ae4e82
3
+ size 27592
example_data/dummy_consultation_response.csv ADDED
@@ -0,0 +1,31 @@
1
+ Response Reference,Object to or Support application,Response text
2
+ R1,Object,I strongly object to the proposed five-storey apartment block on Main Street. It is completely out of keeping with the existing character of the area and will overshadow the existing buildings.
3
+ R2,Support,"I fully support the proposed development. The town needs more housing, and this development will provide much-needed homes."
4
+ R3,Object,The proposed development is too tall and will have a negative impact on the views from the surrounding area.
5
+ R4,Object,The loss of the well-loved cafe will be a great loss to the community.
6
+ R5,Support,The development will bring much-needed investment to the area and create jobs.
7
+ R6,Object,The increased traffic generated by the development will cause congestion on Main Street.
8
+ R7,Support,The development will provide much-needed affordable housing.
9
+ R8,Object,The development will have a negative impact on the local environment.
10
+ R9,Support,The development will improve the appearance of Main Street.
11
+ R10,Object,The development will overshadow the existing buildings and make them feel cramped.
12
+ R11,Support,The development will provide much-needed amenities for the local community.
13
+ R12,Object,The development will have a negative impact on the local wildlife.
14
+ R13,Support,The development will help to revitalise the town centre.
15
+ R14,Object,The development will increase noise pollution in the area.
16
+ R15,Support,The development will provide much-needed parking spaces.
17
+ R16,Object,The development will have a negative impact on the local businesses.
18
+ R17,Support,The development will provide much-needed green space.
19
+ R18,Object,The development will have a negative impact on the local heritage.
20
+ R19,Support,The development will provide much-needed facilities for young people.
21
+ R20,Object,The development will have a negative impact on the local schools.
22
+ R21,Support,The development will provide much-needed social housing.
23
+ R22,Object,The development will have a negative impact on the local infrastructure.
24
+ R23,Support,The development will provide much-needed jobs for local people.
25
+ R24,Object,The development will have a negative impact on the local economy.
26
+ R25,Support,The development will provide much-needed community facilities.
27
+ R26,Object,The development will have a negative impact on the local amenities.
28
+ R27,Support,The development will provide much-needed housing for young people.
29
+ R28,Object,The development will have a negative impact on the local character.
30
+ R29,Support,The development will provide much-needed housing for families.
31
+ R30,Object,The development will have a negative impact on the local quality of life.
example_data/dummy_consultation_response_themes.csv ADDED
@@ -0,0 +1,26 @@
1
+ ο»Ώtopics
2
+ Need for family housing
3
+ Impact on the character of the area
4
+ Amenities for the local community
5
+ Revitalisation of the town centre
6
+ Impact on local wildlife
7
+ Parking
8
+ Impact on local businesses
9
+ Green space
10
+ Noise pollution
11
+ Impact on local heritage
12
+ Facilities for young people
13
+ Impact on local schools
14
+ Impact on views
15
+ Loss of cafe
16
+ Investment and job creation
17
+ Traffic congestion
18
+ Affordable housing
19
+ Impact on the local environment
20
+ Improvement of main street
21
+ Impact on local infrastructure
22
+ Investment and job creation
23
+ Impact on local schools
24
+ Provision of community facilities
25
+ Impact on local heritage
26
+ Impact on quality of life
intros/intro.txt ADDED
@@ -0,0 +1,7 @@
1
+ # Create thematic summaries from your data
2
+
3
+ Extract topics and summarise open text using Large Language Models (LLMs). The model will loop through all text rows to find the most relevant general topics and subtopics, and provide a short summary of each. If you have specific topics in mind, you can enter them in 'Provide a list of specific topics' below.
4
+
5
+ NOTE: LLMs are not 100% accurate and may produce biased or incorrect responses. All files downloaded from this app **need to be checked by a human** before they are used in further outputs.
6
+
7
+ Unsure how to use this app? Try an example by clicking on one of the example datasets below to see typical outputs the app can produce. There is also a user guide provided alongside this app - please ask your system administrator if you do not have access.
lambda_entrypoint.py ADDED
@@ -0,0 +1,466 @@
1
+ import json
2
+ import os
3
+
4
+ import boto3
5
+ from dotenv import load_dotenv
6
+
7
+ # Import the main function from your CLI script
8
+ from cli_topics import main as cli_main
9
+ from tools.config import (
10
+ AWS_REGION,
11
+ BATCH_SIZE_DEFAULT,
12
+ DEDUPLICATION_THRESHOLD,
13
+ DEFAULT_COST_CODE,
14
+ DEFAULT_SAMPLED_SUMMARIES,
15
+ LLM_MAX_NEW_TOKENS,
16
+ LLM_SEED,
17
+ LLM_TEMPERATURE,
18
+ OUTPUT_DEBUG_FILES,
19
+ SAVE_LOGS_TO_CSV,
20
+ SAVE_LOGS_TO_DYNAMODB,
21
+ SESSION_OUTPUT_FOLDER,
22
+ USAGE_LOGS_FOLDER,
23
+ convert_string_to_boolean,
24
+ )
25
+
26
+
27
+ def _get_env_list(env_var_name: str | list[str] | None) -> list[str]:
28
+ """Parses a comma-separated environment variable into a list of strings."""
29
+ if isinstance(env_var_name, list):
30
+ return env_var_name
31
+ if env_var_name is None:
32
+ return []
33
+
34
+ # Handle string input
35
+ value = str(env_var_name).strip()
36
+ if not value or value == "[]":
37
+ return []
38
+
39
+ # Remove brackets if present (e.g., "[item1, item2]" -> "item1, item2")
40
+ if value.startswith("[") and value.endswith("]"):
41
+ value = value[1:-1]
42
+
43
+ # Remove quotes and split by comma
44
+ value = value.replace('"', "").replace("'", "")
45
+ if not value:
46
+ return []
47
+
48
+ # Split by comma and filter out any empty strings
49
+ return [s.strip() for s in value.split(",") if s.strip()]
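+ 
+ 
+ # Illustrative sanity checks for the input formats _get_env_list accepts
+ # (None, plain comma-separated strings, bracketed strings, and real lists);
+ # a sketch only, with placeholder values:
+ assert _get_env_list(None) == []
+ assert _get_env_list("item1, item2") == ["item1", "item2"]
+ assert _get_env_list("['item1', 'item2']") == ["item1", "item2"]
+ assert _get_env_list(["item1", "item2"]) == ["item1", "item2"]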
50
+
51
+
52
+ print("Lambda entrypoint loading...")
53
+
54
+ # Initialize S3 client outside the handler for connection reuse
55
+ s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", AWS_REGION))
56
+ print("S3 client initialised")
57
+
58
+ # Lambda's only writable directory is /tmp. Ensure that all temporary files are stored in this directory.
59
+ TMP_DIR = "/tmp"
60
+ INPUT_DIR = os.path.join(TMP_DIR, "input")
61
+ OUTPUT_DIR = os.path.join(TMP_DIR, "output")
62
+ os.environ["GRADIO_TEMP_DIR"] = os.path.join(TMP_DIR, "gradio_tmp")
63
+ os.environ["MPLCONFIGDIR"] = os.path.join(TMP_DIR, "matplotlib_cache")
64
+ os.environ["FEEDBACK_LOGS_FOLDER"] = os.path.join(TMP_DIR, "feedback")
65
+ os.environ["ACCESS_LOGS_FOLDER"] = os.path.join(TMP_DIR, "logs")
66
+ os.environ["USAGE_LOGS_FOLDER"] = os.path.join(TMP_DIR, "usage")
67
+
68
+ # Define compatible file types for processing
69
+ COMPATIBLE_FILE_TYPES = {
70
+ ".csv",
71
+ ".xlsx",
72
+ ".xls",
73
+ ".parquet",
74
+ }
75
+
76
+
77
+ def download_file_from_s3(bucket_name, key, download_path):
78
+ """Download a file from S3 to the local filesystem."""
79
+ try:
80
+ s3_client.download_file(bucket_name, key, download_path)
81
+ print(f"Successfully downloaded s3://{bucket_name}/{key} to {download_path}")
82
+ except Exception as e:
83
+ print(f"Error downloading from S3: {e}")
84
+ raise
85
+
86
+
87
+ def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
88
+ """Upload all files from a local directory to an S3 prefix."""
89
+ for root, _, files in os.walk(local_directory):
90
+ for file_name in files:
91
+ local_file_path = os.path.join(root, file_name)
92
+ # Create a relative path to maintain directory structure if needed
93
+ relative_path = os.path.relpath(local_file_path, local_directory)
94
+ output_key = os.path.join(s3_prefix, relative_path).replace("\\", "/")
95
+
96
+ try:
97
+ s3_client.upload_file(local_file_path, bucket_name, output_key)
98
+ print(
99
+ f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}"
100
+ )
101
+ except Exception as e:
102
+ print(f"Error uploading to S3: {e}")
103
+ raise
104
+
105
+
106
+ def lambda_handler(event, context):
107
+ print(f"Received event: {json.dumps(event)}")
108
+
109
+ # 1. Setup temporary directories
110
+ os.makedirs(INPUT_DIR, exist_ok=True)
111
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
112
+
113
+ # 2. Extract information from the event
114
+ # Assumes the event is triggered by S3 and may contain an 'arguments' payload
115
+ try:
116
+ record = event["Records"][0]
117
+ bucket_name = record["s3"]["bucket"]["name"]
118
+ input_key = record["s3"]["object"]["key"]
119
+
120
+ # The user metadata can be used to pass arguments
121
+ # This is more robust than embedding them in the main event body
122
+ try:
123
+ response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
124
+ metadata = response.get("Metadata", dict())
125
+ print(f"S3 object metadata: {metadata}")
126
+
127
+ # Arguments can be passed as a JSON string in metadata
128
+ arguments_str = metadata.get("arguments", "{}")
129
+ print(f"Arguments string from metadata: '{arguments_str}'")
130
+
131
+ if arguments_str and arguments_str != "{}":
132
+ arguments = json.loads(arguments_str)
133
+ print(f"Successfully parsed arguments from metadata: {arguments}")
134
+ else:
135
+ arguments = dict()
136
+ print("No arguments found in metadata, using empty dictionary")
137
+ except Exception as e:
138
+ print(f"Warning: Could not parse metadata arguments: {e}")
139
+ print("Using empty arguments dictionary")
140
+ arguments = dict()
141
+
142
+ except (KeyError, IndexError) as e:
143
+ print(
144
+ f"Could not parse S3 event record: {e}. Checking for direct invocation payload."
145
+ )
146
+ # Fallback for direct invocation (e.g., from Step Functions or manual test)
147
+ bucket_name = event.get("bucket_name")
148
+ input_key = event.get("input_key")
149
+ arguments = event.get("arguments", dict())
150
+ if not all([bucket_name, input_key]):
151
+ raise ValueError(
152
+ "Missing 'bucket_name' or 'input_key' in direct invocation event."
153
+ )
154
+
155
+ # Log file type information
156
+ file_extension = os.path.splitext(input_key)[1].lower()
157
+ print(f"Detected file extension: '{file_extension}'")
158
+
159
+ # 3. Download the main input file
160
+ input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
161
+ download_file_from_s3(bucket_name, input_key, input_file_path)
162
+
163
+ # 3.1. Validate file type compatibility
164
+ is_env_file = input_key.lower().endswith(".env")
165
+
166
+ if not is_env_file and file_extension not in COMPATIBLE_FILE_TYPES:
167
+ error_message = f"File type '{file_extension}' is not supported for processing. Compatible file types are: {', '.join(sorted(COMPATIBLE_FILE_TYPES))}"
168
+ print(f"ERROR: {error_message}")
169
+ print(f"File was not processed due to unsupported file type: {file_extension}")
170
+ return {
171
+ "statusCode": 400,
172
+ "body": json.dumps(
173
+ {
174
+ "error": "Unsupported file type",
175
+ "message": error_message,
176
+ "supported_types": list(COMPATIBLE_FILE_TYPES),
177
+ "received_type": file_extension,
178
+ "file_processed": False,
179
+ }
180
+ ),
181
+ }
182
+
183
+ print(f"File type '{file_extension}' is compatible for processing")
184
+ if is_env_file:
185
+ print("Processing .env file for configuration")
186
+ else:
187
+ print(f"Processing {file_extension} file for topic modelling")
188
+
189
+ # 3.5. Check if the downloaded file is a .env file and handle accordingly
190
+ actual_input_file_path = input_file_path
191
+ if input_key.lower().endswith(".env"):
192
+ print("Detected .env file, loading environment variables...")
193
+
194
+ # Load environment variables from the .env file
195
+ print(f"Loading .env file from: {input_file_path}")
196
+
197
+ # Check if file exists and is readable
198
+ if os.path.exists(input_file_path):
199
+ print(".env file exists and is readable")
200
+ with open(input_file_path, "r") as f:
201
+ content = f.read()
202
+ print(f".env file content preview: {content[:200]}...")
203
+ else:
204
+ print(f"ERROR: .env file does not exist at {input_file_path}")
205
+
206
+ load_dotenv(input_file_path, override=True)
207
+ print("Environment variables loaded from .env file")
208
+
209
+ # Extract the actual input file path from environment variables
210
+ env_input_file = os.getenv("INPUT_FILE")
211
+
212
+ if env_input_file:
213
+ print(f"Found input file path in environment: {env_input_file}")
214
+
215
+ # If the path is an S3 path, download it
216
+ if env_input_file.startswith("s3://"):
217
+ # Parse S3 path: s3://bucket/key
218
+ s3_path_parts = env_input_file[5:].split("/", 1)
219
+ if len(s3_path_parts) == 2:
220
+ env_bucket = s3_path_parts[0]
221
+ env_key = s3_path_parts[1]
222
+ actual_input_file_path = os.path.join(
223
+ INPUT_DIR, os.path.basename(env_key)
224
+ )
225
+ print(
226
+ f"Downloading actual input file from s3://{env_bucket}/{env_key}"
227
+ )
228
+ download_file_from_s3(env_bucket, env_key, actual_input_file_path)
229
+ else:
230
+ print("Warning: Invalid S3 path format in environment variable")
231
+ actual_input_file_path = input_file_path
232
+ else:
233
+ # Assume it's a local path or relative path
234
+ actual_input_file_path = env_input_file
235
+ print(
236
+ f"Using input file path from environment: {actual_input_file_path}"
237
+ )
238
+ else:
239
+ print("Warning: No input file path found in environment variables")
240
+ # Fall back to using the .env file itself (though this might not be what we want)
241
+ actual_input_file_path = input_file_path
242
+ else:
243
+ print("File is not a .env file, proceeding with normal processing")
244
+
245
+ # 4. Prepare arguments for the CLI function
246
+ # This dictionary should mirror the arguments that cli_topics.main() expects via direct_mode_args
247
+
248
+ cli_args = {
249
+ # Task Selection
250
+ "task": arguments.get("task", os.getenv("DIRECT_MODE_TASK", "extract")),
251
+ # General Arguments
252
+ "input_file": [actual_input_file_path] if actual_input_file_path else None,
253
+ "output_dir": arguments.get(
254
+ "output_dir", os.getenv("DIRECT_MODE_OUTPUT_DIR", OUTPUT_DIR)
255
+ ),
256
+ "input_dir": arguments.get("input_dir", INPUT_DIR),
257
+ "text_column": arguments.get(
258
+ "text_column", os.getenv("DIRECT_MODE_TEXT_COLUMN", "")
259
+ ),
260
+ "previous_output_files": _get_env_list(
261
+ arguments.get(
262
+ "previous_output_files",
263
+ os.getenv("DIRECT_MODE_PREVIOUS_OUTPUT_FILES", list()),
264
+ )
265
+ ),
266
+ "username": arguments.get("username", os.getenv("DIRECT_MODE_USERNAME", "")),
267
+ "save_to_user_folders": convert_string_to_boolean(
268
+ arguments.get(
269
+ "save_to_user_folders",
270
+ os.getenv("SESSION_OUTPUT_FOLDER", str(SESSION_OUTPUT_FOLDER)),
271
+ )
272
+ ),
273
+ "excel_sheets": _get_env_list(
274
+ arguments.get("excel_sheets", os.getenv("DIRECT_MODE_EXCEL_SHEETS", list()))
275
+ ),
276
+ "group_by": arguments.get("group_by", os.getenv("DIRECT_MODE_GROUP_BY", "")),
277
+ # Model Configuration
278
+ "model_choice": arguments.get(
279
+ "model_choice", os.getenv("DIRECT_MODE_MODEL_CHOICE", "")
280
+ ),
281
+ "temperature": float(
282
+ arguments.get(
283
+ "temperature",
284
+ os.getenv("DIRECT_MODE_TEMPERATURE", str(LLM_TEMPERATURE)),
285
+ )
286
+ ),
287
+ "batch_size": int(
288
+ arguments.get(
289
+ "batch_size",
290
+ os.getenv("DIRECT_MODE_BATCH_SIZE", str(BATCH_SIZE_DEFAULT)),
291
+ )
292
+ ),
293
+ "max_tokens": int(
294
+ arguments.get(
295
+ "max_tokens",
296
+ os.getenv("DIRECT_MODE_MAX_TOKENS", str(LLM_MAX_NEW_TOKENS)),
297
+ )
298
+ ),
299
+ "google_api_key": arguments.get(
300
+ "google_api_key", os.getenv("GEMINI_API_KEY", "")
301
+ ),
302
+ "aws_access_key": None, # Use IAM Role instead of keys
303
+ "aws_secret_key": None, # Use IAM Role instead of keys
304
+ "aws_region": os.getenv("AWS_REGION", AWS_REGION),
305
+ "hf_token": arguments.get("hf_token", os.getenv("HF_TOKEN", "")),
306
+ "azure_api_key": arguments.get(
307
+ "azure_api_key", os.getenv("AZURE_OPENAI_API_KEY", "")
308
+ ),
309
+ "azure_endpoint": arguments.get(
310
+ "azure_endpoint", os.getenv("AZURE_OPENAI_INFERENCE_ENDPOINT", "")
311
+ ),
312
+ "api_url": arguments.get("api_url", os.getenv("API_URL", "")),
313
+ "inference_server_model": arguments.get(
314
+ "inference_server_model", os.getenv("CHOSEN_INFERENCE_SERVER_MODEL", "")
315
+ ),
316
+ # Topic Extraction Arguments
317
+ "context": arguments.get("context", os.getenv("DIRECT_MODE_CONTEXT", "")),
318
+ "candidate_topics": arguments.get(
319
+ "candidate_topics", os.getenv("DIRECT_MODE_CANDIDATE_TOPICS", "")
320
+ ),
321
+ "force_zero_shot": arguments.get(
322
+ "force_zero_shot", os.getenv("DIRECT_MODE_FORCE_ZERO_SHOT", "No")
323
+ ),
324
+ "force_single_topic": arguments.get(
325
+ "force_single_topic", os.getenv("DIRECT_MODE_FORCE_SINGLE_TOPIC", "No")
326
+ ),
327
+ "produce_structured_summary": arguments.get(
328
+ "produce_structured_summary",
329
+ os.getenv("DIRECT_MODE_PRODUCE_STRUCTURED_SUMMARY", "No"),
330
+ ),
331
+ "sentiment": arguments.get(
332
+ "sentiment", os.getenv("DIRECT_MODE_SENTIMENT", "Negative or Positive")
333
+ ),
334
+ "additional_summary_instructions": arguments.get(
335
+ "additional_summary_instructions",
336
+ os.getenv("DIRECT_MODE_ADDITIONAL_SUMMARY_INSTRUCTIONS", ""),
337
+ ),
338
+ # Validation Arguments
339
+ "additional_validation_issues": arguments.get(
340
+ "additional_validation_issues",
341
+ os.getenv("DIRECT_MODE_ADDITIONAL_VALIDATION_ISSUES", ""),
342
+ ),
343
+ "show_previous_table": arguments.get(
344
+ "show_previous_table", os.getenv("DIRECT_MODE_SHOW_PREVIOUS_TABLE", "Yes")
345
+ ),
346
+ "output_debug_files": arguments.get(
347
+ "output_debug_files", str(OUTPUT_DEBUG_FILES)
348
+ ),
349
+ "max_time_for_loop": int(
350
+ arguments.get("max_time_for_loop", os.getenv("MAX_TIME_FOR_LOOP", "99999"))
351
+ ),
352
+ # Deduplication Arguments
353
+ "method": arguments.get(
354
+ "method", os.getenv("DIRECT_MODE_DEDUPLICATION_METHOD", "fuzzy")
355
+ ),
356
+ "similarity_threshold": int(
357
+ arguments.get(
358
+ "similarity_threshold",
359
+ os.getenv("DEDUPLICATION_THRESHOLD", DEDUPLICATION_THRESHOLD),
360
+ )
361
+ ),
362
+ "merge_sentiment": arguments.get(
363
+ "merge_sentiment", os.getenv("DIRECT_MODE_MERGE_SENTIMENT", "No")
364
+ ),
365
+ "merge_general_topics": arguments.get(
366
+ "merge_general_topics", os.getenv("DIRECT_MODE_MERGE_GENERAL_TOPICS", "Yes")
367
+ ),
368
+ # Summarisation Arguments
369
+ "summary_format": arguments.get(
370
+ "summary_format", os.getenv("DIRECT_MODE_SUMMARY_FORMAT", "two_paragraph")
371
+ ),
372
+ "sample_reference_table": arguments.get(
373
+ "sample_reference_table",
374
+ os.getenv("DIRECT_MODE_SAMPLE_REFERENCE_TABLE", "True"),
375
+ ),
376
+ "no_of_sampled_summaries": int(
377
+ arguments.get(
378
+ "no_of_sampled_summaries",
379
+ os.getenv("DEFAULT_SAMPLED_SUMMARIES", DEFAULT_SAMPLED_SUMMARIES),
380
+ )
381
+ ),
382
+ "random_seed": int(
383
+ arguments.get("random_seed", os.getenv("LLM_SEED", LLM_SEED))
384
+ ),
385
+ # Output Format Arguments
386
+ "create_xlsx_output": convert_string_to_boolean(
387
+ arguments.get(
388
+ "create_xlsx_output",
389
+ os.getenv("DIRECT_MODE_CREATE_XLSX_OUTPUT", "True"),
390
+ )
391
+ ),
392
+ # Logging Arguments
393
+ "save_logs_to_csv": convert_string_to_boolean(
394
+ arguments.get(
395
+ "save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", str(SAVE_LOGS_TO_CSV))
396
+ )
397
+ ),
398
+ "save_logs_to_dynamodb": convert_string_to_boolean(
399
+ arguments.get(
400
+ "save_logs_to_dynamodb",
401
+ os.getenv("SAVE_LOGS_TO_DYNAMODB", str(SAVE_LOGS_TO_DYNAMODB)),
402
+ )
403
+ ),
404
+ "usage_logs_folder": arguments.get("usage_logs_folder", USAGE_LOGS_FOLDER),
405
+ "cost_code": arguments.get(
406
+ "cost_code", os.getenv("DEFAULT_COST_CODE", DEFAULT_COST_CODE)
407
+ ),
408
+ }
409
+
410
+ # Download optional files if they are specified
411
+ candidate_topics_key = arguments.get("candidate_topics_s3_key")
412
+ if candidate_topics_key:
413
+ candidate_topics_path = os.path.join(INPUT_DIR, "candidate_topics.csv")
414
+ download_file_from_s3(bucket_name, candidate_topics_key, candidate_topics_path)
415
+ cli_args["candidate_topics"] = candidate_topics_path
416
+
417
+ # Download previous output files if they are S3 keys
418
+ if cli_args["previous_output_files"]:
419
+ downloaded_previous_files = []
420
+ for prev_file in cli_args["previous_output_files"]:
421
+ if prev_file.startswith("s3://"):
422
+ # Parse S3 path
423
+ s3_path_parts = prev_file[5:].split("/", 1)
424
+ if len(s3_path_parts) == 2:
425
+ prev_bucket = s3_path_parts[0]
426
+ prev_key = s3_path_parts[1]
427
+ local_prev_path = os.path.join(
428
+ INPUT_DIR, os.path.basename(prev_key)
429
+ )
430
+ download_file_from_s3(prev_bucket, prev_key, local_prev_path)
431
+ downloaded_previous_files.append(local_prev_path)
432
+ else:
433
+ downloaded_previous_files.append(prev_file)
434
+ else:
435
+ downloaded_previous_files.append(prev_file)
436
+ cli_args["previous_output_files"] = downloaded_previous_files
437
+
438
+ # 5. Execute the main application logic
439
+ try:
440
+ print("--- Starting CLI Topics Main Function ---")
441
+ print(
442
+ f"Arguments passed to cli_main: {json.dumps({k: v for k, v in cli_args.items() if k not in ['aws_access_key', 'aws_secret_key']}, default=str)}"
443
+ )
444
+ cli_main(direct_mode_args=cli_args)
445
+ print("--- CLI Topics Main Function Finished ---")
446
+ except Exception as e:
447
+ print(f"An error occurred during CLI execution: {e}")
448
+ import traceback
449
+
450
+ traceback.print_exc()
451
+ # Optionally, re-raise the exception to make the Lambda fail
452
+ raise
453
+
454
+ # 6. Upload results back to S3
455
+ output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
456
+ print(
457
+ f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/"
458
+ )
459
+ upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
460
+
461
+ return {
462
+ "statusCode": 200,
463
+ "body": json.dumps(
464
+ f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/"
465
+ ),
466
+ }
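+ 
+ 
+ # A minimal local test sketch for lambda_handler using the direct-invocation
+ # fallback above. The bucket name is an illustrative placeholder, and the key
+ # assumes the example data has been uploaded to that bucket beforehand.
+ if __name__ == "__main__":
+     test_event = {
+         "bucket_name": "example-bucket",  # hypothetical bucket
+         "input_key": "input/dummy_consultation_response.csv",
+         "arguments": {"task": "extract", "text_column": "Response text"},
+     }
+     lambda_handler(test_event, context=None)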
load_dynamo_logs.py ADDED
@@ -0,0 +1,102 @@
1
+ import csv
2
+ import datetime
3
+ from decimal import Decimal
4
+
5
+ import boto3
6
+
7
+ from tools.config import (
8
+ AWS_REGION,
9
+ OUTPUT_FOLDER,
10
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
11
+ )
12
+
13
+ # Replace with your actual table name and region
14
+ TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
15
+ REGION = AWS_REGION
16
+ CSV_OUTPUT = OUTPUT_FOLDER + "dynamodb_logs_export.csv"
17
+
18
+ # Create DynamoDB resource
19
+ dynamodb = boto3.resource("dynamodb", region_name=REGION)
20
+ table = dynamodb.Table(TABLE_NAME)
21
+
22
+
23
+ # Helper function to convert Decimal to float or int
24
+ def convert_types(item):
25
+ new_item = {}
26
+ for key, value in item.items():
27
+ # Handle Decimals first
28
+ if isinstance(value, Decimal):
29
+ new_item[key] = int(value) if value % 1 == 0 else float(value)
30
+ # Handle Strings that might be dates
31
+ elif isinstance(value, str):
32
+ try:
33
+ # Attempt to parse a common ISO 8601 format.
34
+ # The .replace() handles the 'Z' for Zulu/UTC time.
35
+ dt_obj = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
36
+ # Now that we have a datetime object, format it as desired
37
+ new_item[key] = dt_obj.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
38
+ except (ValueError, TypeError):
39
+ # If it fails to parse, it's just a regular string
40
+ new_item[key] = value
41
+ # Handle all other types
42
+ else:
43
+ new_item[key] = value
44
+ return new_item
45
+
46
+
47
+ # Paginated scan
48
+ def scan_table():
49
+ items = []
50
+ response = table.scan()
51
+ items.extend(response["Items"])
52
+
53
+ while "LastEvaluatedKey" in response:
54
+ response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
55
+ items.extend(response["Items"])
56
+
57
+ return items
58
+
59
+
60
+ # Export to CSV
62
+ def export_to_csv(items, output_path, fields_to_drop: list = None):
63
+ if not items:
64
+ print("No items found.")
65
+ return
66
+
67
+ # Use a set for efficient lookup
68
+ drop_set = set(fields_to_drop or [])
69
+
70
+ # Get a comprehensive list of all possible headers from all items
71
+ all_keys = set()
72
+ for item in items:
73
+ all_keys.update(item.keys())
74
+
75
+ # Determine the final fieldnames by subtracting the ones to drop
76
+ fieldnames = sorted(list(all_keys - drop_set))
77
+
78
+ print("Final CSV columns will be:", fieldnames)
79
+
80
+ with open(output_path, "w", newline="", encoding="utf-8-sig") as csvfile:
81
+ # The key fix is here: extrasaction='ignore'
82
+ # restval='' is also good practice to handle rows that are missing a key
83
+ writer = csv.DictWriter(
84
+ csvfile, fieldnames=fieldnames, extrasaction="ignore", restval=""
85
+ )
86
+ writer.writeheader()
87
+
88
+ for item in items:
89
+ # The convert_types function can now return the full dict,
90
+ # and the writer will simply ignore the extra fields.
91
+ writer.writerow(convert_types(item))
92
+
93
+ print(f"Exported {len(items)} items to {output_path}")
94
+
95
+
96
+ # Run export
97
+ items = scan_table()
98
+ export_to_csv(
99
+ items,
100
+ CSV_OUTPUT,
101
+ fields_to_drop=["Query metadata - usage counts and other parameters"],
102
+ )
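+ 
+ # Quick illustrative check of convert_types on a representative item;
+ # the field names below are placeholders, not real table attributes.
+ sample_item = {
+     "input_tokens": Decimal("1520"),
+     "cost": Decimal("0.25"),
+     "timestamp": "2025-04-09T12:30:00Z",
+ }
+ print(convert_types(sample_item))
+ # -> {'input_tokens': 1520, 'cost': 0.25, 'timestamp': '2025-04-09 12:30:00.000'}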
load_s3_logs.py ADDED
@@ -0,0 +1,93 @@
1
+ from datetime import datetime
2
+ from io import StringIO
3
+
4
+ import boto3
5
+ import pandas as pd
6
+
7
+ from tools.config import (
8
+ AWS_ACCESS_KEY,
9
+ AWS_REGION,
10
+ AWS_SECRET_KEY,
11
+ DOCUMENT_REDACTION_BUCKET,
12
+ OUTPUT_FOLDER,
13
+ )
14
+
15
+ # Combine log files together so they can then be used for e.g. dashboarding and financial tracking.
16
+
17
+ # S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
18
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
19
+ s3 = boto3.client(
20
+ "s3",
21
+ aws_access_key_id=AWS_ACCESS_KEY,
22
+ aws_secret_access_key=AWS_SECRET_KEY,
23
+ region_name=AWS_REGION,
24
+ )
25
+ else:
26
+ s3 = boto3.client("s3")
27
+
28
+ bucket_name = DOCUMENT_REDACTION_BUCKET
29
+ prefix = "usage/" # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
30
+ earliest_date = "20250409" # Earliest date of logs folder retrieved
31
+ latest_date = "20250423" # Latest date of logs folder retrieved
32
+
33
+
34
+ # Function to list all files in a folder
35
+ def list_files_in_s3(bucket, prefix):
36
+ response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
37
+ if "Contents" in response:
38
+ return [content["Key"] for content in response["Contents"]]
39
+ return []
40
+
41
+
42
+ # Function to filter date range
43
+ def is_within_date_range(date_str, start_date, end_date):
44
+ date_obj = datetime.strptime(date_str, "%Y%m%d")
45
+ return start_date <= date_obj <= end_date
46
+
47
+
48
+ # Define the date range
49
+ start_date = datetime.strptime(earliest_date, "%Y%m%d") # Replace with your start date
50
+ end_date = datetime.strptime(latest_date, "%Y%m%d") # Replace with your end date
51
+
52
+ # List all subfolders under 'usage/'
53
+ all_files = list_files_in_s3(bucket_name, prefix)
54
+
55
+ # Filter based on date range
56
+ log_files = []
57
+ for file in all_files:
58
+ parts = file.split("/")
59
+ if len(parts) >= 3:
60
+ date_str = parts[1]
61
+ if (
62
+ is_within_date_range(date_str, start_date, end_date)
63
+ and parts[-1] == "log.csv"
64
+ ):
65
+ log_files.append(file)
66
+
67
+ # Download, read and concatenate CSV files into a pandas DataFrame
68
+ df_list = []
69
+ for log_file in log_files:
70
+ # Download the file
71
+ obj = s3.get_object(Bucket=bucket_name, Key=log_file)
72
+ try:
73
+ csv_content = (
74
+ obj["Body"].read().decode("utf-8")
75
+ ) # Suggest trying latin-1 instead of utf-8 if this fails
76
+ except Exception as e:
77
+ print("Could not load in log file:", log_file, "due to:", e)
78
+ continue
79
+
80
+ # Read CSV content into pandas DataFrame
81
+ df = pd.read_csv(StringIO(csv_content))
82
+
83
+ df_list.append(df)
84
+
85
+ # Concatenate all DataFrames
86
+ if df_list:
87
+ concatenated_df = pd.concat(df_list, ignore_index=True)
88
+
89
+ # Save the concatenated DataFrame to a CSV file
90
+ concatenated_df.to_csv(OUTPUT_FOLDER + "consolidated_s3_logs.csv", index=False)
91
+ print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
92
+ else:
93
+ print("No log files found in the given date range.")
pyproject.toml ADDED
@@ -0,0 +1,147 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "llm_topic_modelling"
7
+ version = "0.6.0"
8
+ description = "Generate thematic summaries from open text in tabular data files with a large language model."
9
+ requires-python = ">=3.10"
10
+ readme = "README.md"
11
+ authors = [
12
+ { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
13
+ ]
14
+ maintainers = [
15
+ { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
16
+ ]
17
+ keywords = [
18
+ "topic-modelling",
19
+ "topic-modeling",
20
+ "llm",
21
+ "large-language-models",
22
+ "thematic-analysis",
23
+ "text-analysis",
24
+ "nlp",
25
+ "natural-language-processing",
26
+ "text-summarization",
27
+ "text-summarisation",
28
+ "thematic-summaries",
29
+ "gradio",
30
+ "data-analysis",
31
+ "tabular-data",
32
+ "excel",
33
+ "csv",
34
+ "open-text",
35
+ "text-mining"
36
+ ]
37
+ classifiers = [
38
+ "Development Status :: 4 - Beta",
39
+ "Intended Audience :: Developers",
40
+ "Intended Audience :: Science/Research",
41
+ "Intended Audience :: Information Technology",
42
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
43
+ "Topic :: Text Processing :: Linguistic",
44
+ "Topic :: Text Processing :: Markup",
45
+ "Topic :: Scientific/Engineering :: Information Analysis",
46
+ "Programming Language :: Python :: 3",
47
+ "Programming Language :: Python :: 3.10",
48
+ "Programming Language :: Python :: 3.11",
49
+ "Programming Language :: Python :: 3.12",
50
+ "Programming Language :: Python :: 3.13",
51
+ ]
52
+
53
+ dependencies = [
54
+ "gradio==6.0.2",
55
+ "transformers==4.57.2",
56
+ "spaces==0.42.1",
57
+ "boto3==1.42.1",
58
+ "pandas<=2.3.3",
59
+ "pyarrow>=21.0.0",
60
+ "openpyxl>=3.1.5",
61
+ "markdown>=3.7",
62
+ "tabulate>=0.9.0",
63
+ "lxml>=5.3.0",
64
+ "google-genai<=1.52.0",
65
+ "openai<=2.8.1",
66
+ "html5lib>=1.1",
67
+ "beautifulsoup4>=4.12.3",
68
+ "rapidfuzz>=3.13.0",
69
+ "python-dotenv>=1.1.0"
70
+ ]
71
+
72
+ [project.optional-dependencies]
73
+ dev = ["pytest"]
74
+ test = ["pytest", "pytest-cov"]
75
+
76
+ # Extra dependencies for local VLM models. This extra also installs the unsloth package.
77
+ # For torch, install with --index-url https://download.pytorch.org/whl/cu128.
78
+ torch = [
79
+ "torch<=2.9.1",
80
+ "torchvision",
81
+ "accelerate",
82
+ "bitsandbytes",
83
+ "unsloth==2025.11.6",
84
+ "unsloth_zoo==2025.11.6",
85
+ "timm",
86
+ "xformers"
87
+ ]
88
+
89
+ # If you want to install llama-cpp-python in GPU mode, use cmake.args="-DGGML_CUDA=on" . If that doesn't work, try specific wheels for your system, e.g. for Linux see files in https://github.com/JamePeng/llama-cpp-python/releases. More details on installation here: https://llama-cpp-python.readthedocs.io/en/latest
90
+ llamacpp = [
91
+ "llama-cpp-python>=0.3.16",
92
+ ]
93
+
94
+ # Run Gradio as an mcp server
95
+ mcp = [
96
+ "gradio[mcp]==6.0.2"
97
+ ]
98
+
99
+ [project.urls]
100
+ Homepage = "https://github.com/seanpedrick-case/llm_topic_modelling"
101
+ repository = "https://github.com/seanpedrick-case/llm_topic_modelling"
102
+
103
+ [tool.setuptools]
104
+ packages = ["tools"]
105
+ py-modules = ["app"]
106
+
107
+ # Configuration for Ruff linter:
108
+ [tool.ruff]
109
+ line-length = 88
110
+
111
+ [tool.ruff.lint]
112
+ select = ["E", "F", "I"]
113
+ ignore = [
114
+ "E501", # line-too-long (handled with Black)
115
+ "E402", # module-import-not-at-top-of-file (sometimes needed for conditional imports)
116
+ ]
117
+
118
+ [tool.ruff.lint.per-file-ignores]
119
+ "__init__.py" = ["F401"] # Allow unused imports in __init__.py
120
+
121
+ # Configuration for a Black formatter:
122
+ [tool.black]
123
+ line-length = 88
124
+ target-version = ['py310']
125
+
126
+ # Configuration for pytest:
127
+ [tool.pytest.ini_options]
128
+ filterwarnings = [
129
+ "ignore::DeprecationWarning:click.parser",
130
+ "ignore::DeprecationWarning:weasel.util.config",
131
+ "ignore::DeprecationWarning:builtin type",
132
+ "ignore::DeprecationWarning:websockets.legacy",
133
+ "ignore::DeprecationWarning:websockets.server",
134
+ "ignore::DeprecationWarning:spacy.cli._util",
136
+ "ignore::DeprecationWarning:importlib._bootstrap",
137
+ ]
138
+ testpaths = ["test"]
139
+ python_files = ["test_*.py", "*_test.py"]
140
+ python_classes = ["Test*"]
141
+ python_functions = ["test_*"]
142
+ addopts = [
143
+ "-v",
144
+ "--tb=short",
145
+ "--strict-markers",
146
+ "--disable-warnings",
147
+ ]
requirements.txt ADDED
@@ -0,0 +1,29 @@
1
+ # Note: this requirements file is optimised for Hugging Face spaces / Python 3.10. Use requirements_lightweight.txt for installation without local model inference (the simplest way to get going), requirements_cpu.txt for CPU instances, and requirements_gpu.txt for GPU instances using Python 3.11
2
+ gradio==6.0.2
3
+ transformers==4.57.2
4
+ spaces==0.42.1
5
+ boto3>=1.42.1
6
+ pandas>=2.3.3
7
+ pyarrow>=21.0.0
8
+ openpyxl>=3.1.5
9
+ markdown>=3.7
10
+ tabulate>=0.9.0
11
+ lxml>=5.3.0
12
+ google-genai>=1.52.0
13
+ openai>=2.8.1
14
+ html5lib>=1.1
15
+ beautifulsoup4>=4.12.3
16
+ rapidfuzz>=3.13.0
17
+ python-dotenv>=1.1.0
18
+ # GPU (for huggingface instance)
19
+ # Torch/Unsloth and llama-cpp-python
20
+ # Latest compatible with CUDA 12.4
21
+ torch<=2.9.1 --extra-index-url https://download.pytorch.org/whl/cu128
22
+ unsloth[cu128-torch280]<=2025.11.6
23
+ unsloth_zoo<=2025.11.6
24
+ timm
25
+ # llama-cpp-python direct wheel link for GPU compatible version 3.17 for use with Python 3.10 and Hugging Face
26
+ https://github.com/JamePeng/llama-cpp-python/releases/download/v0.3.17-cu128-Basic-linux-20251202/llama_cpp_python-0.3.17-cp310-cp310-linux_x86_64.whl
27
+ #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
28
+
29
+
requirements_cpu.txt ADDED
@@ -0,0 +1,24 @@
1
+ gradio==6.0.2
2
+ transformers==4.57.2
3
+ spaces==0.42.1
4
+ pandas>=2.3.3
5
+ boto3>=1.42.1
6
+ pyarrow>=21.0.0
7
+ openpyxl>=3.1.5
8
+ markdown>=3.7
9
+ tabulate>=0.9.0
10
+ lxml>=5.3.0
11
+ google-genai>=1.52.0
12
+ openai>=2.8.1
13
+ html5lib>=1.1
14
+ beautifulsoup4>=4.12.3
15
+ rapidfuzz>=3.13.0
16
+ python-dotenv>=1.1.0
17
+ torch<=2.9.1 --extra-index-url https://download.pytorch.org/whl/cpu
18
+ llama-cpp-python==0.3.16 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
19
+ # Direct wheel links if above doesn't work
20
+ # I have created CPU Linux, Python 3.11 compatible wheels:
21
+ # https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
22
+ # Windows, Python 3.11 compatible CPU wheels available:
23
+ # https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64_cpu_openblas.whl
24
+ # If the above doesn't work for Windows, try looking at 'windows_install_llama-cpp-python.txt' for instructions on how to build from source
requirements_gpu.txt ADDED
@@ -0,0 +1,28 @@
1
+
2
+ gradio==6.0.2
3
+ transformers==4.57.2
4
+ spaces==0.42.1
5
+ boto3>=1.42.1
6
+ pandas>=2.3.3
7
+ pyarrow>=21.0.0
8
+ openpyxl>=3.1.5
9
+ markdown>=3.7
10
+ tabulate>=0.9.0
11
+ lxml>=5.3.0
12
+ google-genai>=1.52.0
13
+ openai>=2.8.1
14
+ html5lib>=1.1
15
+ beautifulsoup4>=4.12.3
16
+ rapidfuzz>=3.13.0
17
+ python-dotenv>=1.1.0
18
+ # Torch/Unsloth
19
+ # Latest compatible with CUDA 12.4
20
+ torch<=2.9.1 --extra-index-url https://download.pytorch.org/whl/cu128
21
+ unsloth[cu128-torch280]<=2025.11.6 # Refer here for more details on installation: https://pypi.org/project/unsloth
22
+ unsloth_zoo<=2025.11.6
23
+ # Additional for Windows and CUDA 12.4 older GPUS (RTX 3x or similar):
24
+ #triton-windows<3.3
25
+ timm
26
+ # Llama CPP Python
27
+ llama-cpp-python>=0.3.16 -C cmake.args="-DGGML_CUDA=on"
28
+ # If above doesn't work, try specific wheels for your system, see files in https://github.com/JamePeng/llama-cpp-python/releases for different python versions
requirements_lightweight.txt ADDED
@@ -0,0 +1,18 @@
1
+ # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile
2
+ gradio==6.0.2
3
+ transformers==4.57.2
4
+ spaces==0.42.1
5
+ boto3>=1.42.1
6
+ pandas>=2.3.3
7
+ pyarrow>=21.0.0
8
+ openpyxl>=3.1.5
9
+ markdown>=3.7
10
+ tabulate>=0.9.0
11
+ lxml>=5.3.0
12
+ google-genai>=1.52.0
13
+ openai>=2.8.1
14
+ html5lib>=1.1
15
+ beautifulsoup4>=4.12.3
16
+ rapidfuzz>=3.13.0
17
+ python-dotenv>=1.1.0
18
+ awslambdaric==3.1.1
test/README.md ADDED
@@ -0,0 +1,87 @@
1
+ # Test Suite for LLM Topic Modeller
2
+
3
+ This test suite provides comprehensive testing for the CLI interface (`cli_topics.py`) and GUI application (`app.py`).
4
+
5
+ ## Overview
6
+
7
+ The test suite includes:
8
+ - **CLI Tests**: Tests based on examples from the `cli_topics.py` epilog
9
+ - **GUI Tests**: Tests to verify the Gradio interface loads correctly
10
+ - **Mock Inference Server**: A dummy inference-server endpoint that avoids API costs during testing
11
+
12
+ ## Structure
13
+
14
+ - `test.py`: Main test suite with CLI tests
15
+ - `test_gui_only.py`: GUI-specific tests
16
+ - `mock_inference_server.py`: Mock HTTP server that mimics an inference-server API
+ - `mock_llm_calls.py`: Patches `requests.post` to return mock LLM responses for testing
17
+ - `run_tests.py`: Test runner script
18
+ - `__init__.py`: Package initialization
19
+
20
+ ## Running Tests
21
+
22
+ ### Run All Tests
23
+
24
+ From the project root directory:
25
+
26
+ ```bash
27
+ python test/run_tests.py
28
+ ```
29
+
30
+ Or from the test directory:
31
+
32
+ ```bash
33
+ python run_tests.py
34
+ ```
35
+
36
+ ### Run Only CLI Tests
37
+
38
+ ```bash
39
+ python -m unittest test.test.TestCLITopicsExamples
40
+ ```
41
+
42
+ ### Run Only GUI Tests
43
+
44
+ ```bash
45
+ python test/test_gui_only.py
46
+ ```
47
+
48
+ ## Mock Inference Server
49
+
50
+ The test suite uses a mock inference server to avoid API costs during testing. The mock server:
51
+
52
+ - Listens on `localhost:8080` by default
53
+ - Responds to `/v1/chat/completions` endpoint
54
+ - Returns valid markdown table responses that satisfy validation requirements
55
+ - Provides token counts for usage tracking
56
+
57
+ The mock server is automatically started before tests and stopped after tests complete.
58
+
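+ You can also exercise the mock server by hand. A minimal sketch (the endpoint and response shape below match `mock_inference_server.py`):
+ 
+ ```python
+ import requests
+ 
+ from test.mock_inference_server import MockInferenceServer
+ 
+ with MockInferenceServer() as server:
+     resp = requests.post(
+         f"{server.get_url()}/v1/chat/completions",
+         json={"messages": [{"role": "user", "content": "Extract topics"}]},
+     )
+     print(resp.json()["choices"][0]["message"]["content"])
+ ```
+ 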
59
+ ## Test Coverage
60
+
61
+ The CLI tests cover:
62
+
63
+ 1. **Topic Extraction**
64
+ - Default settings
65
+ - Custom model and context
66
+ - Grouping by column
67
+ - Zero-shot extraction with candidate topics
68
+
69
+ 2. **Topic Deduplication**
70
+ - Fuzzy matching
71
+ - LLM-based deduplication
72
+
73
+ 3. **All-in-One Pipeline**
74
+ - Complete workflow (extract, deduplicate, summarise)
75
+
76
+ ## Requirements
77
+
78
+ - Python 3.10+
79
+ - All dependencies from `requirements.txt`
80
+ - Example data files in `example_data/` directory
81
+
82
+ ## Notes
83
+
84
+ - Tests will be skipped if required example files are not found
85
+ - The mock server must be running for CLI tests to work
86
+ - Tests use temporary output directories that are cleaned up after execution
87
+
test/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """
2
+ Test suite for LLM Topic Modeller CLI.
3
+
4
+ This package contains tests for the CLI interface and GUI application.
5
+ """
test/mock_inference_server.py ADDED
@@ -0,0 +1,225 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Mock inference server for testing CLI topic extraction without API costs.
4
+
5
+ This server mimics an inference-server API endpoint and returns dummy
6
+ responses that satisfy the validation requirements (markdown tables with |).
7
+ """
8
+
9
+ import json
10
+ import threading
11
+ from http.server import BaseHTTPRequestHandler, HTTPServer
12
+ from typing import Optional
13
+
14
+
15
+ class MockInferenceServerHandler(BaseHTTPRequestHandler):
16
+ """HTTP request handler for the mock inference server."""
17
+
18
+ def _generate_mock_response(self, prompt: str, system_prompt: str) -> str:
19
+ """
20
+ Generate a mock response that satisfies validation requirements.
21
+
22
+ The response must:
23
+ - Be longer than 120 characters
24
+ - Contain a markdown table (with | characters)
25
+
26
+ Args:
27
+ prompt: The user prompt
28
+ system_prompt: The system prompt
29
+
30
+ Returns:
31
+ A mock markdown table response
32
+ """
33
+ # Generate a simple markdown table that satisfies the validation
34
+ # This mimics a topic extraction table response
35
+ mock_table = """| Reference | General Topic | Sub-topic | Sentiment |
36
+ |-----------|---------------|-----------|-----------|
37
+ | 1 | Test Topic | Test Subtopic | Positive |
38
+ | 2 | Another Topic | Another Subtopic | Neutral |
39
+ | 3 | Third Topic | Third Subtopic | Negative |
40
+
41
+ This is a mock response from the test inference server. The actual content would be generated by a real LLM model, but for testing purposes, this dummy response allows us to verify that the CLI commands work correctly without incurring API costs."""
42
+
43
+ return mock_table
44
+
45
+ def _estimate_tokens(self, text: str) -> int:
46
+ """Estimate token count (rough approximation: ~4 characters per token)."""
47
+ return max(1, len(text) // 4)
48
+
49
+ def do_POST(self):
50
+ """Handle POST requests to /v1/chat/completions."""
51
+ print(f"[Mock Server] Received POST request to: {self.path}")
52
+ if self.path == "/v1/chat/completions":
53
+ try:
54
+ # Read request body
55
+ content_length = int(self.headers.get("Content-Length", 0))
56
+ print(f"[Mock Server] Content-Length: {content_length}")
57
+ body = self.rfile.read(content_length)
58
+ payload = json.loads(body.decode("utf-8"))
59
+ print("[Mock Server] Payload received, processing...")
60
+
61
+ # Extract messages
62
+ messages = payload.get("messages", [])
63
+ system_prompt = ""
64
+ user_prompt = ""
65
+
66
+ for msg in messages:
67
+ role = msg.get("role", "")
68
+ content = msg.get("content", "")
69
+ if role == "system":
70
+ system_prompt = content
71
+ elif role == "user":
72
+ user_prompt = content
73
+
74
+ # Generate mock response
75
+ response_text = self._generate_mock_response(user_prompt, system_prompt)
76
+
77
+ # Estimate tokens
78
+ input_tokens = self._estimate_tokens(system_prompt + "\n" + user_prompt)
79
+ output_tokens = self._estimate_tokens(response_text)
80
+
81
+ # Check if streaming is requested
82
+ stream = payload.get("stream", False)
83
+
84
+ if stream:
85
+ # Handle streaming response
86
+ self.send_response(200)
87
+ self.send_header("Content-Type", "text/event-stream")
88
+ self.send_header("Cache-Control", "no-cache")
89
+ self.send_header("Connection", "keep-alive")
90
+ self.end_headers()
91
+
92
+ # Send streaming chunks
93
+ chunk_size = 20 # Characters per chunk
94
+ for i in range(0, len(response_text), chunk_size):
95
+ chunk = response_text[i : i + chunk_size]
96
+ chunk_data = {
97
+ "choices": [
98
+ {
99
+ "delta": {"content": chunk},
100
+ "index": 0,
101
+ "finish_reason": None,
102
+ }
103
+ ]
104
+ }
105
+ self.wfile.write(f"data: {json.dumps(chunk_data)}\n\n".encode())
106
+ self.wfile.flush()
107
+
108
+ # Send final done message
109
+ self.wfile.write(b"data: [DONE]\n\n")
110
+ self.wfile.flush()
111
+ else:
112
+ # Handle non-streaming response
113
+ response_data = {
114
+ "choices": [
115
+ {
116
+ "index": 0,
117
+ "finish_reason": "stop",
118
+ "message": {
119
+ "role": "assistant",
120
+ "content": response_text,
121
+ },
122
+ }
123
+ ],
124
+ "usage": {
125
+ "prompt_tokens": input_tokens,
126
+ "completion_tokens": output_tokens,
127
+ "total_tokens": input_tokens + output_tokens,
128
+ },
129
+ }
130
+
131
+ self.send_response(200)
132
+ self.send_header("Content-Type", "application/json")
133
+ self.end_headers()
134
+ self.wfile.write(json.dumps(response_data).encode())
135
+
136
+ except Exception as e:
137
+ self.send_response(500)
138
+ self.send_header("Content-Type", "application/json")
139
+ self.end_headers()
140
+ error_response = {"error": {"message": str(e), "type": "server_error"}}
141
+ self.wfile.write(json.dumps(error_response).encode())
142
+ else:
143
+ self.send_response(404)
144
+ self.end_headers()
145
+
146
+ def log_message(self, format, *args):
147
+ """Log messages for debugging."""
148
+ # Enable logging for debugging
149
+ print(f"[Mock Server] {format % args}")
150
+
151
+
152
+ class MockInferenceServer:
153
+ """Mock inference server that can be started and stopped for testing."""
154
+
155
+ def __init__(self, host: str = "localhost", port: int = 8080):
156
+ """
157
+ Initialize the mock server.
158
+
159
+ Args:
160
+ host: Host to bind to (default: localhost)
161
+ port: Port to bind to (default: 8080)
162
+ """
163
+ self.host = host
164
+ self.port = port
165
+ self.server: Optional[HTTPServer] = None
166
+ self.server_thread: Optional[threading.Thread] = None
167
+ self.running = False
168
+
169
+ def start(self):
170
+ """Start the mock server in a separate thread."""
171
+ if self.running:
172
+ return
173
+
174
+ def run_server():
175
+ self.server = HTTPServer((self.host, self.port), MockInferenceServerHandler)
176
+ self.running = True
177
+ self.server.serve_forever()
178
+
179
+ self.server_thread = threading.Thread(target=run_server, daemon=True)
180
+ self.server_thread.start()
181
+
182
+ # Wait a moment for server to start
183
+ import time
184
+
185
+ time.sleep(0.5)
186
+
187
+ def stop(self):
188
+ """Stop the mock server."""
189
+ if self.server and self.running:
190
+ self.server.shutdown()
191
+ self.server.server_close()
192
+ self.running = False
193
+
194
+ def get_url(self) -> str:
195
+ """Get the server URL."""
196
+ return f"http://{self.host}:{self.port}"
197
+
198
+ def __enter__(self):
199
+ """Context manager entry."""
200
+ self.start()
201
+ return self
202
+
203
+ def __exit__(self, exc_type, exc_val, exc_tb):
204
+ """Context manager exit."""
205
+ self.stop()
206
+
207
+
208
+ if __name__ == "__main__":
209
+ # Test the server
210
+ print("Starting mock inference server on http://localhost:8080")
211
+ print("Press Ctrl+C to stop")
212
+
213
+ server = MockInferenceServer()
214
+ try:
215
+ server.start()
216
+ print(f"Server running at {server.get_url()}")
217
+ # Keep running
218
+ while True:
219
+ import time
220
+
221
+ time.sleep(1)
222
+ except KeyboardInterrupt:
223
+ print("\nStopping server...")
224
+ server.stop()
225
+ print("Server stopped")
test/mock_llm_calls.py ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Mock LLM function calls for testing CLI topic extraction without API costs.
4
+
5
+ This module patches requests.post to intercept HTTP calls to inference servers
6
+ and return mock responses instead.
7
+ """
8
+
9
+ import json
10
+ import os
11
+
12
+ # Store original requests if it exists
13
+ _original_requests = None
14
+
15
+
16
+ def _generate_mock_response(prompt: str, system_prompt: str) -> str:
17
+ """
18
+ Generate a mock response that satisfies validation requirements.
19
+
20
+ The response must:
21
+ - Be longer than 120 characters
22
+ - Contain a markdown table (with | characters)
23
+
24
+ Args:
25
+ prompt: The user prompt
26
+ system_prompt: The system prompt
27
+
28
+ Returns:
29
+ A mock markdown table response
30
+ """
31
+ # Generate a simple markdown table that satisfies the validation
32
+ # This mimics a topic extraction table response
33
+ mock_table = """| Reference | General Topic | Sub-topic | Sentiment |
34
+ |-----------|---------------|-----------|-----------|
35
+ | 1 | Test Topic | Test Subtopic | Positive |
36
+ | 2 | Another Topic | Another Subtopic | Neutral |
37
+ | 3 | Third Topic | Third Subtopic | Negative |
38
+
39
+ This is a mock response from the test inference server. The actual content would be generated by a real LLM model, but for testing purposes, this dummy response allows us to verify that the CLI commands work correctly without incurring API costs."""
40
+
41
+ return mock_table
42
+
43
+
44
+ def _estimate_tokens(text: str) -> int:
45
+ """Estimate token count (rough approximation: ~4 characters per token)."""
46
+ return max(1, len(text) // 4)
47
+
48
+
49
+ def mock_requests_post(url, **kwargs):
50
+ """
51
+ Mock version of requests.post that intercepts inference-server calls.
52
+
53
+ Returns a mock response object that mimics the real requests.Response.
54
+ """
55
+ # Only mock inference-server URLs
56
+ if "/v1/chat/completions" not in url:
57
+ # For non-inference-server URLs, use real requests
58
+ import requests
59
+
60
+ return requests.post(url, **kwargs)
61
+
62
+ # Extract payload
63
+ payload = kwargs.get("json", {})
64
+ messages = payload.get("messages", [])
65
+
66
+ # Extract prompts
67
+ system_prompt = ""
68
+ user_prompt = ""
69
+ for msg in messages:
70
+ role = msg.get("role", "")
71
+ content = msg.get("content", "")
72
+ if role == "system":
73
+ system_prompt = content
74
+ elif role == "user":
75
+ user_prompt = content
76
+
77
+ # Generate mock response
78
+ response_text = _generate_mock_response(user_prompt, system_prompt)
79
+
80
+ # Estimate tokens
81
+ input_tokens = _estimate_tokens(system_prompt + "\n" + user_prompt)
82
+ output_tokens = _estimate_tokens(response_text)
83
+
84
+ # Check if streaming is requested
85
+ stream = payload.get("stream", False)
86
+
87
+ if stream:
88
+ # For streaming, create a mock response with iter_lines
89
+ class MockStreamResponse:
90
+ def __init__(self, text):
91
+ self.text = text
92
+ self.status_code = 200
93
+ self.lines = []
94
+ # Simulate streaming chunks
95
+ chunk_size = 20
96
+ for i in range(0, len(text), chunk_size):
97
+ chunk = text[i : i + chunk_size]
98
+ chunk_data = {
99
+ "choices": [
100
+ {
101
+ "delta": {"content": chunk},
102
+ "index": 0,
103
+ "finish_reason": None,
104
+ }
105
+ ]
106
+ }
107
+ self.lines.append(f"data: {json.dumps(chunk_data)}\n\n".encode())
108
+ self.lines.append(b"data: [DONE]\n\n")
109
+ self._line_index = 0
110
+
111
+ def raise_for_status(self):
112
+ pass
113
+
114
+ def iter_lines(self):
115
+ for line in self.lines:
116
+ yield line
117
+
118
+ return MockStreamResponse(response_text)
119
+ else:
120
+ # For non-streaming, create a simple mock response
121
+ class MockResponse:
122
+ def __init__(self, text, input_tokens, output_tokens):
123
+ self._json_data = {
124
+ "choices": [
125
+ {
126
+ "index": 0,
127
+ "finish_reason": "stop",
128
+ "message": {
129
+ "role": "assistant",
130
+ "content": text,
131
+ },
132
+ }
133
+ ],
134
+ "usage": {
135
+ "prompt_tokens": input_tokens,
136
+ "completion_tokens": output_tokens,
137
+ "total_tokens": input_tokens + output_tokens,
138
+ },
139
+ }
140
+ self.status_code = 200
141
+
142
+ def raise_for_status(self):
143
+ pass
144
+
145
+ def json(self):
146
+ return self._json_data
147
+
148
+ return MockResponse(response_text, input_tokens, output_tokens)
149
+
150
+
151
+ def apply_mock_patches():
152
+ """
153
+ Apply patches to mock HTTP requests.
154
+ This should be called before importing modules that use requests.
155
+ """
156
+ global _original_requests
157
+
158
+ try:
159
+ import requests
160
+
161
+ _original_requests = requests.post
162
+ requests.post = mock_requests_post
163
+ print("[Mock] Patched requests.post for inference-server calls")
164
+ except ImportError:
165
+ # requests not imported yet, will be patched when imported
166
+ pass
167
+
168
+
169
+ def restore_original():
170
+ """Restore original requests.post if it was patched."""
171
+ global _original_requests
172
+ if _original_requests:
173
+ try:
174
+ import requests
175
+
176
+ requests.post = _original_requests
177
+ _original_requests = None
178
+ print("[Mock] Restored original requests.post")
179
+ except ImportError:
180
+ pass
181
+
182
+
183
+ # Auto-apply patches if TEST_MODE environment variable is set
184
+ if os.environ.get("TEST_MODE") == "1" or os.environ.get("USE_MOCK_LLM") == "1":
185
+ apply_mock_patches()
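+ 
+ 
+ # Minimal smoke-test sketch: apply the patch, post to a fake inference-server
+ # URL (never actually contacted once patched), then restore the original.
+ if __name__ == "__main__":
+     apply_mock_patches()
+     try:
+         import requests
+ 
+         resp = requests.post(
+             "http://localhost:9999/v1/chat/completions",
+             json={"messages": [{"role": "user", "content": "Extract topics"}]},
+         )
+         print(resp.json()["choices"][0]["message"]["content"][:80])
+     finally:
+         restore_original()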
test/run_tests.py ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple script to run the CLI topics test suite.
4
+
5
+ This script demonstrates how to run the comprehensive test suite
6
+ that covers all the examples from the CLI epilog.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+
12
+ # Add the parent directory to the path so we can import the test module
13
+ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14
+ sys.path.insert(0, parent_dir)
15
+
16
+ # Import test functions
17
+ from test.test import run_all_tests
18
+
19
+ if __name__ == "__main__":
20
+ print("Starting LLM Topic Modeller Test Suite...")
21
+ print("This will test:")
22
+ print("- CLI examples from the epilog")
23
+ print("- GUI application functionality")
24
+ print("Using a mock inference-server to avoid API costs.")
25
+ print("=" * 60)
26
+
27
+ success = run_all_tests()
28
+
29
+ if success:
30
+ print("\nπŸŽ‰ All tests passed successfully!")
31
+ sys.exit(0)
32
+ else:
33
+ print("\n❌ Some tests failed. Check the output above for details.")
34
+ sys.exit(1)
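
If you prefer plain unittest to the wrapper above, an equivalent invocation might look like this sketch; the discovery pattern is an assumption about the naming of the test files in this folder.

    import os
    import sys
    import unittest

    os.environ["USE_MOCK_LLM"] = "1"
    os.environ["TEST_MODE"] = "1"

    suite = unittest.defaultTestLoader.discover("test", pattern="test*.py")
    result = unittest.TextTestRunner(verbosity=2).run(suite)
    sys.exit(0 if result.wasSuccessful() else 1)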
test/test.py ADDED
@@ -0,0 +1,1067 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
+ import time
7
+ import unittest
8
+ from typing import List, Optional
9
+
10
+ # Mock LLM calls are automatically applied via environment variables
11
+ # No need to import - the mock patches are applied when USE_MOCK_LLM=1 is set
12
+
13
+
14
+ def run_cli_topics(
15
+ script_path: str,
16
+ task: str,
17
+ output_dir: str,
18
+ input_file: Optional[str] = None,
19
+ text_column: Optional[str] = None,
20
+ previous_output_files: Optional[List[str]] = None,
21
+ timeout: int = 600, # 10-minute timeout
22
+ # General Arguments
23
+ username: Optional[str] = None,
24
+ save_to_user_folders: Optional[bool] = None,
25
+ excel_sheets: Optional[List[str]] = None,
26
+ group_by: Optional[str] = None,
27
+ # Model Configuration
28
+ model_choice: Optional[str] = None,
29
+ temperature: Optional[float] = None,
30
+ batch_size: Optional[int] = None,
31
+ max_tokens: Optional[int] = None,
32
+ api_url: Optional[str] = None,
33
+ inference_server_model: Optional[str] = None,
34
+ # Topic Extraction Arguments
35
+ context: Optional[str] = None,
36
+ candidate_topics: Optional[str] = None,
37
+ force_zero_shot: Optional[str] = None,
38
+ force_single_topic: Optional[str] = None,
39
+ produce_structured_summary: Optional[str] = None,
40
+ sentiment: Optional[str] = None,
41
+ additional_summary_instructions: Optional[str] = None,
42
+ # Validation Arguments
43
+ additional_validation_issues: Optional[str] = None,
44
+ show_previous_table: Optional[str] = None,
45
+ output_debug_files: Optional[str] = None,
46
+ max_time_for_loop: Optional[int] = None,
47
+ # Deduplication Arguments
48
+ method: Optional[str] = None,
49
+ similarity_threshold: Optional[int] = None,
50
+ merge_sentiment: Optional[str] = None,
51
+ merge_general_topics: Optional[str] = None,
52
+ # Summarisation Arguments
53
+ summary_format: Optional[str] = None,
54
+ sample_reference_table: Optional[str] = None,
55
+ no_of_sampled_summaries: Optional[int] = None,
56
+ random_seed: Optional[int] = None,
57
+ # Output Format Arguments
58
+ create_xlsx_output: Optional[bool] = None,
59
+ # Logging Arguments
60
+ save_logs_to_csv: Optional[bool] = None,
61
+ save_logs_to_dynamodb: Optional[bool] = None,
62
+ cost_code: Optional[str] = None,
63
+ ) -> bool:
64
+ """
65
+ Executes the cli_topics.py script with specified arguments using a subprocess.
66
+
67
+ Args:
68
+ script_path (str): The path to the cli_topics.py script.
69
+ task (str): The main task to perform ('extract', 'validate', 'deduplicate', 'summarise', 'overall_summary', or 'all_in_one').
70
+ output_dir (str): The path to the directory for output files.
71
+ input_file (str, optional): Path to the input file to process.
72
+ text_column (str, optional): Name of the text column to process.
73
+ previous_output_files (List[str], optional): Path(s) to previous output files.
74
+ timeout (int): Timeout in seconds for the subprocess.
75
+
76
+ All other arguments match the CLI arguments from cli_topics.py.
77
+
78
+ Returns:
79
+ bool: True if the script executed successfully, False otherwise.
80
+ """
81
+ # 1. Get absolute paths and perform pre-checks
82
+ script_abs_path = os.path.abspath(script_path)
83
+ output_abs_dir = os.path.abspath(output_dir)
84
+
85
+ # Handle input file based on task
86
+ if task in ["extract", "validate", "all_in_one"] and input_file is None:
87
+ raise ValueError(f"Input file is required for '{task}' task")
88
+
89
+ if input_file:
90
+ input_abs_path = os.path.abspath(input_file)
91
+ if not os.path.isfile(input_abs_path):
92
+ raise FileNotFoundError(f"Input file not found: {input_abs_path}")
93
+
94
+ if not os.path.isfile(script_abs_path):
95
+ raise FileNotFoundError(f"Script not found: {script_abs_path}")
96
+
97
+ if not os.path.isdir(output_abs_dir):
98
+ # Create the output directory if it doesn't exist
99
+ print(f"Output directory not found. Creating: {output_abs_dir}")
100
+ os.makedirs(output_abs_dir)
101
+
102
+ script_folder = os.path.dirname(script_abs_path)
103
+
104
+ # 2. Dynamically build the command list
105
+ command = [
106
+ sys.executable,  # current interpreter, not whatever "python" resolves to
107
+ script_abs_path,
108
+ "--output_dir",
109
+ output_abs_dir,
110
+ "--task",
111
+ task,
112
+ ]
113
+
114
+ # Add input_file only if it's not None
115
+ if input_file:
116
+ command.extend(["--input_file", input_abs_path])
117
+
118
+ # Add general arguments
119
+ if text_column:
120
+ command.extend(["--text_column", text_column])
121
+ if previous_output_files:
122
+ command.extend(["--previous_output_files"] + previous_output_files)
123
+ if username:
124
+ command.extend(["--username", username])
125
+ if save_to_user_folders is not None:
126
+ command.extend(["--save_to_user_folders", str(save_to_user_folders)])
127
+ if excel_sheets:
128
+ command.append("--excel_sheets")
129
+ command.extend(excel_sheets)
130
+ if group_by:
131
+ command.extend(["--group_by", group_by])
132
+
133
+ # Add model configuration arguments
134
+ if model_choice:
135
+ command.extend(["--model_choice", model_choice])
136
+ if temperature is not None:
137
+ command.extend(["--temperature", str(temperature)])
138
+ if batch_size is not None:
139
+ command.extend(["--batch_size", str(batch_size)])
140
+ if max_tokens is not None:
141
+ command.extend(["--max_tokens", str(max_tokens)])
142
+ if api_url:
143
+ command.extend(["--api_url", api_url])
144
+ if inference_server_model:
145
+ command.extend(["--inference_server_model", inference_server_model])
146
+
147
+ # Add topic extraction arguments
148
+ if context:
149
+ command.extend(["--context", context])
150
+ if candidate_topics:
151
+ command.extend(["--candidate_topics", candidate_topics])
152
+ if force_zero_shot:
153
+ command.extend(["--force_zero_shot", force_zero_shot])
154
+ if force_single_topic:
155
+ command.extend(["--force_single_topic", force_single_topic])
156
+ if produce_structured_summary:
157
+ command.extend(["--produce_structured_summary", produce_structured_summary])
158
+ if sentiment:
159
+ command.extend(["--sentiment", sentiment])
160
+ if additional_summary_instructions:
161
+ command.extend(
162
+ ["--additional_summary_instructions", additional_summary_instructions]
163
+ )
164
+
165
+ # Add validation arguments
166
+ if additional_validation_issues:
167
+ command.extend(["--additional_validation_issues", additional_validation_issues])
168
+ if show_previous_table:
169
+ command.extend(["--show_previous_table", show_previous_table])
170
+ if output_debug_files:
171
+ command.extend(["--output_debug_files", output_debug_files])
172
+ if max_time_for_loop is not None:
173
+ command.extend(["--max_time_for_loop", str(max_time_for_loop)])
174
+
175
+ # Add deduplication arguments
176
+ if method:
177
+ command.extend(["--method", method])
178
+ if similarity_threshold is not None:
179
+ command.extend(["--similarity_threshold", str(similarity_threshold)])
180
+ if merge_sentiment:
181
+ command.extend(["--merge_sentiment", merge_sentiment])
182
+ if merge_general_topics:
183
+ command.extend(["--merge_general_topics", merge_general_topics])
184
+
185
+ # Add summarisation arguments
186
+ if summary_format:
187
+ command.extend(["--summary_format", summary_format])
188
+ if sample_reference_table:
189
+ command.extend(["--sample_reference_table", sample_reference_table])
190
+ if no_of_sampled_summaries is not None:
191
+ command.extend(["--no_of_sampled_summaries", str(no_of_sampled_summaries)])
192
+ if random_seed is not None:
193
+ command.extend(["--random_seed", str(random_seed)])
194
+
195
+ # Add output format arguments
196
+ if create_xlsx_output is False:
197
+ command.append("--no_xlsx_output")
198
+
199
+ # Add logging arguments
200
+ if save_logs_to_csv is not None:
201
+ command.extend(["--save_logs_to_csv", str(save_logs_to_csv)])
202
+ if save_logs_to_dynamodb is not None:
203
+ command.extend(["--save_logs_to_dynamodb", str(save_logs_to_dynamodb)])
204
+ if cost_code:
205
+ command.extend(["--cost_code", cost_code])
206
+
207
+ # Filter out None values before joining
208
+ command_str = " ".join(str(arg) for arg in command if arg is not None)
209
+ print(f"Executing command: {command_str}")
210
+
211
+ # 3. Execute the command using subprocess
212
+ try:
213
+ # Use unbuffered output to avoid hanging
214
+ env = os.environ.copy()
215
+ env["PYTHONUNBUFFERED"] = "1"
216
+ # Ensure inference server is enabled for testing
217
+ env["RUN_INFERENCE_SERVER"] = "1"
218
+ # Enable mock mode
219
+ env["USE_MOCK_LLM"] = "1"
220
+ env["TEST_MODE"] = "1"
221
+
222
+ result = subprocess.Popen(
223
+ command,
224
+ stdout=subprocess.PIPE,
225
+ stderr=subprocess.STDOUT, # Combine stderr with stdout to avoid deadlocks
226
+ text=True,
227
+ cwd=script_folder, # Important for relative paths within the script
228
+ env=env,
229
+ bufsize=0, # Unbuffered
230
+ )
231
+
232
+ # Read output in real-time to avoid deadlocks
233
+ start_time = time.time()
234
+
235
+ # For Windows, we need a different approach
236
+ if sys.platform == "win32":
237
+ # On Windows, use communicate with timeout
238
+ try:
239
+ stdout, stderr = result.communicate(timeout=timeout)
240
+ except subprocess.TimeoutExpired:
241
+ result.kill()
242
+ stdout, stderr = result.communicate()
243
+ raise subprocess.TimeoutExpired(result.args, timeout)
244
+ else:
245
+ # On Unix, we can use select for real-time reading
246
+ import select
247
+
248
+ stdout_lines = []
249
+ while result.poll() is None:
250
+ ready, _, _ = select.select([result.stdout], [], [], 0.1)
251
+ if ready:
252
+ line = result.stdout.readline()
253
+ if line:
254
+ print(line.rstrip(), flush=True)
255
+ stdout_lines.append(line)
256
+ # Check timeout
257
+ if time.time() - start_time > timeout:
258
+ result.kill()
259
+ raise subprocess.TimeoutExpired(result.args, timeout)
260
+
261
+ # Read remaining output
262
+ remaining = result.stdout.read()
263
+ if remaining:
264
+ print(remaining, end="", flush=True)
265
+ stdout_lines.append(remaining)
266
+
267
+ stdout = "".join(stdout_lines)
268
+ stderr = "" # Combined with stdout
269
+
270
+ print("--- SCRIPT STDOUT ---")
271
+ if stdout:
272
+ print(stdout)
273
+ print("--- SCRIPT STDERR ---")
274
+ if stderr:
275
+ print(stderr)
276
+ print("---------------------")
277
+
278
+ # Analyze the output for errors and success indicators
279
+ analysis = analyze_test_output(stdout, stderr)
280
+
281
+ if analysis["has_errors"]:
282
+ print("❌ Errors detected in output:")
283
+ for i, error_type in enumerate(analysis["error_types"]):
284
+ print(f" {i+1}. {error_type}")
285
+ if analysis["error_messages"]:
286
+ print(" Error messages:")
287
+ for msg in analysis["error_messages"][
288
+ :3
289
+ ]: # Show first 3 error messages
290
+ print(f" - {msg}")
291
+ return False
292
+ elif result.returncode == 0:
293
+ success_msg = "βœ… Script executed successfully."
294
+ if analysis["success_indicators"]:
295
+ success_msg += f" (Success indicators: {', '.join(analysis['success_indicators'][:3])})"
296
+ print(success_msg)
297
+ return True
298
+ else:
299
+ print(f"❌ Command failed with return code {result.returncode}")
300
+ return False
301
+
302
+ except subprocess.TimeoutExpired:
303
+ result.kill()
304
+ print(f"❌ Subprocess timed out after {timeout} seconds.")
305
+ return False
306
+ except Exception as e:
307
+ print(f"❌ An unexpected error occurred: {e}")
308
+ return False
309
+
310
+
311
+ def analyze_test_output(stdout: str, stderr: str) -> dict:
312
+ """
313
+ Analyze test output to provide detailed error information.
314
+
315
+ Args:
316
+ stdout (str): Standard output from the test
317
+ stderr (str): Standard error from the test
318
+
319
+ Returns:
320
+ dict: Analysis results with error details
321
+ """
322
+ combined_output = (stdout or "") + (stderr or "")
323
+
324
+ analysis = {
325
+ "has_errors": False,
326
+ "error_types": [],
327
+ "error_messages": [],
328
+ "success_indicators": [],
329
+ "warning_indicators": [],
330
+ }
331
+
332
+ # Error patterns
333
+ error_patterns = {
334
+ "An error occurred": "General error message",
335
+ "Error:": "Error prefix",
336
+ "Exception:": "Exception occurred",
337
+ "Traceback": "Python traceback",
338
+ "Failed to": "Operation failure",
339
+ "Cannot": "Operation not possible",
340
+ "Unable to": "Operation not possible",
341
+ "KeyError:": "Missing key/dictionary error",
342
+ "AttributeError:": "Missing attribute error",
343
+ "TypeError:": "Type mismatch error",
344
+ "ValueError:": "Invalid value error",
345
+ "FileNotFoundError:": "File not found",
346
+ "ImportError:": "Import failure",
347
+ "ModuleNotFoundError:": "Module not found",
348
+ }
349
+
350
+ # Success indicators
351
+ success_patterns = [
352
+ "Successfully",
353
+ "Completed",
354
+ "Finished",
355
+ "Processed",
356
+ "Complete",
357
+ "Output files saved",
358
+ ]
359
+
360
+ # Warning indicators
361
+ warning_patterns = ["Warning:", "WARNING:", "Deprecated", "DeprecationWarning"]
362
+
363
+ # Check for errors
364
+ for pattern, description in error_patterns.items():
365
+ if pattern.lower() in combined_output.lower():
366
+ analysis["has_errors"] = True
367
+ analysis["error_types"].append(description)
368
+
369
+ # Extract the actual error message
370
+ lines = combined_output.split("\n")
371
+ for line in lines:
372
+ if pattern.lower() in line.lower():
373
+ analysis["error_messages"].append(line.strip())
374
+
375
+ # Check for success indicators
376
+ for pattern in success_patterns:
377
+ if pattern.lower() in combined_output.lower():
378
+ analysis["success_indicators"].append(pattern)
379
+
380
+ # Check for warnings
381
+ for pattern in warning_patterns:
382
+ if pattern.lower() in combined_output.lower():
383
+ analysis["warning_indicators"].append(pattern)
384
+
385
+ return analysis
386
+
387
+
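
A quick sanity check of analyze_test_output (the transcript strings below are invented examples, not real CLI output):

    report = analyze_test_output(
        stdout="Processed 30 rows\nOutput files saved to output/",
        stderr="",
    )
    assert report["has_errors"] is False
    assert "Processed" in report["success_indicators"]

    report = analyze_test_output(stdout="", stderr="Traceback (most recent call last):")
    assert report["has_errors"] is True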
388
+ def run_app_direct_mode(
389
+ app_path: str,
390
+ task: str,
391
+ output_dir: str,
392
+ input_file: Optional[str] = None,
393
+ text_column: Optional[str] = None,
394
+ previous_output_files: Optional[List[str]] = None,
395
+ timeout: int = 600,
396
+ # General Arguments
397
+ username: Optional[str] = None,
398
+ save_to_user_folders: Optional[bool] = None,
399
+ excel_sheets: Optional[List[str]] = None,
400
+ group_by: Optional[str] = None,
401
+ # Model Configuration
402
+ model_choice: Optional[str] = None,
403
+ temperature: Optional[float] = None,
404
+ batch_size: Optional[int] = None,
405
+ max_tokens: Optional[int] = None,
406
+ api_url: Optional[str] = None,
407
+ inference_server_model: Optional[str] = None,
408
+ # Topic Extraction Arguments
409
+ context: Optional[str] = None,
410
+ candidate_topics: Optional[str] = None,
411
+ force_zero_shot: Optional[str] = None,
412
+ force_single_topic: Optional[str] = None,
413
+ produce_structured_summary: Optional[str] = None,
414
+ sentiment: Optional[str] = None,
415
+ additional_summary_instructions: Optional[str] = None,
416
+ # Validation Arguments
417
+ additional_validation_issues: Optional[str] = None,
418
+ show_previous_table: Optional[str] = None,
419
+ output_debug_files: Optional[str] = None,
420
+ max_time_for_loop: Optional[int] = None,
421
+ # Deduplication Arguments
422
+ method: Optional[str] = None,
423
+ similarity_threshold: Optional[int] = None,
424
+ merge_sentiment: Optional[str] = None,
425
+ merge_general_topics: Optional[str] = None,
426
+ # Summarisation Arguments
427
+ summary_format: Optional[str] = None,
428
+ sample_reference_table: Optional[str] = None,
429
+ no_of_sampled_summaries: Optional[int] = None,
430
+ random_seed: Optional[int] = None,
431
+ # Output Format Arguments
432
+ create_xlsx_output: Optional[bool] = None,
433
+ # Logging Arguments
434
+ save_logs_to_csv: Optional[bool] = None,
435
+ save_logs_to_dynamodb: Optional[bool] = None,
436
+ cost_code: Optional[str] = None,
437
+ ) -> bool:
438
+ """
439
+ Executes the app.py script in direct mode with specified environment variables.
440
+
441
+ Args:
442
+ app_path (str): The path to the app.py script.
443
+ task (str): The main task to perform ('extract', 'validate', 'deduplicate', 'summarise', 'overall_summary', or 'all_in_one').
444
+ output_dir (str): The path to the directory for output files.
445
+ input_file (str, optional): Path to the input file to process.
446
+ text_column (str, optional): Name of the text column to process.
447
+ previous_output_files (List[str], optional): Path(s) to previous output files.
448
+ timeout (int): Timeout in seconds for the subprocess.
449
+
450
+ All other arguments match the CLI arguments from cli_topics.py, but are set as environment variables.
451
+
452
+ Returns:
453
+ bool: True if the script executed successfully, False otherwise.
454
+ """
455
+ # 1. Get absolute paths and perform pre-checks
456
+ app_abs_path = os.path.abspath(app_path)
457
+ output_abs_dir = os.path.abspath(output_dir)
458
+
459
+ # Handle input file based on task
460
+ if task in ["extract", "validate", "all_in_one"] and input_file is None:
461
+ raise ValueError(f"Input file is required for '{task}' task")
462
+
463
+ if input_file:
464
+ input_abs_path = os.path.abspath(input_file)
465
+ if not os.path.isfile(input_abs_path):
466
+ raise FileNotFoundError(f"Input file not found: {input_abs_path}")
467
+
468
+ if not os.path.isfile(app_abs_path):
469
+ raise FileNotFoundError(f"App script not found: {app_abs_path}")
470
+
471
+ if not os.path.isdir(output_abs_dir):
472
+ # Create the output directory if it doesn't exist
473
+ print(f"Output directory not found. Creating: {output_abs_dir}")
474
+ os.makedirs(output_abs_dir)
475
+
476
+ script_folder = os.path.dirname(app_abs_path)
477
+
478
+ # 2. Build environment variables for direct mode
479
+ env = os.environ.copy()
480
+ env["PYTHONUNBUFFERED"] = "1"
481
+ env["RUN_INFERENCE_SERVER"] = "1"
482
+ env["USE_MOCK_LLM"] = "1"
483
+ env["TEST_MODE"] = "1"
484
+
485
+ # Enable direct mode
486
+ env["RUN_DIRECT_MODE"] = "1"
487
+
488
+ # Task selection
489
+ env["DIRECT_MODE_TASK"] = task
490
+
491
+ # General arguments
492
+ if input_file:
493
+ # Store the absolute path so the subprocess resolves it regardless of cwd
494
+ env["DIRECT_MODE_INPUT_FILE"] = input_abs_path
495
+ env["DIRECT_MODE_OUTPUT_DIR"] = output_abs_dir
496
+ if text_column:
497
+ env["DIRECT_MODE_TEXT_COLUMN"] = text_column
498
+ if previous_output_files:
499
+ # Use pipe separator to handle file paths with spaces
500
+ env["DIRECT_MODE_PREVIOUS_OUTPUT_FILES"] = "|".join(previous_output_files)
501
+ if username:
502
+ env["DIRECT_MODE_USERNAME"] = username
503
+ if save_to_user_folders is not None:
504
+ env["SESSION_OUTPUT_FOLDER"] = str(save_to_user_folders)
505
+ if excel_sheets:
506
+ env["DIRECT_MODE_EXCEL_SHEETS"] = ",".join(excel_sheets)
507
+ if group_by:
508
+ env["DIRECT_MODE_GROUP_BY"] = group_by
509
+
510
+ # Model configuration
511
+ if model_choice:
512
+ env["DIRECT_MODE_MODEL_CHOICE"] = model_choice
513
+ if temperature is not None:
514
+ env["DIRECT_MODE_TEMPERATURE"] = str(temperature)
515
+ if batch_size is not None:
516
+ env["DIRECT_MODE_BATCH_SIZE"] = str(batch_size)
517
+ if max_tokens is not None:
518
+ env["DIRECT_MODE_MAX_TOKENS"] = str(max_tokens)
519
+ if api_url:
520
+ env["API_URL"] = api_url
521
+ if inference_server_model:
522
+ env["DIRECT_MODE_INFERENCE_SERVER_MODEL"] = inference_server_model
523
+
524
+ # Topic extraction arguments
525
+ if context:
526
+ env["DIRECT_MODE_CONTEXT"] = context
527
+ if candidate_topics:
528
+ env["DIRECT_MODE_CANDIDATE_TOPICS"] = candidate_topics
529
+ if force_zero_shot:
530
+ env["DIRECT_MODE_FORCE_ZERO_SHOT"] = force_zero_shot
531
+ if force_single_topic:
532
+ env["DIRECT_MODE_FORCE_SINGLE_TOPIC"] = force_single_topic
533
+ if produce_structured_summary:
534
+ env["DIRECT_MODE_PRODUCE_STRUCTURED_SUMMARY"] = produce_structured_summary
535
+ if sentiment:
536
+ env["DIRECT_MODE_SENTIMENT"] = sentiment
537
+ if additional_summary_instructions:
538
+ env["DIRECT_MODE_ADDITIONAL_SUMMARY_INSTRUCTIONS"] = (
539
+ additional_summary_instructions
540
+ )
541
+
542
+ # Validation arguments
543
+ if additional_validation_issues:
544
+ env["DIRECT_MODE_ADDITIONAL_VALIDATION_ISSUES"] = additional_validation_issues
545
+ if show_previous_table:
546
+ env["DIRECT_MODE_SHOW_PREVIOUS_TABLE"] = show_previous_table
547
+ if output_debug_files:
548
+ env["OUTPUT_DEBUG_FILES"] = output_debug_files
549
+ if max_time_for_loop is not None:
550
+ env["DIRECT_MODE_MAX_TIME_FOR_LOOP"] = str(max_time_for_loop)
551
+
552
+ # Deduplication arguments
553
+ if method:
554
+ env["DIRECT_MODE_DEDUP_METHOD"] = method
555
+ if similarity_threshold is not None:
556
+ env["DIRECT_MODE_SIMILARITY_THRESHOLD"] = str(similarity_threshold)
557
+ if merge_sentiment:
558
+ env["DIRECT_MODE_MERGE_SENTIMENT"] = merge_sentiment
559
+ if merge_general_topics:
560
+ env["DIRECT_MODE_MERGE_GENERAL_TOPICS"] = merge_general_topics
561
+
562
+ # Summarisation arguments
563
+ if summary_format:
564
+ env["DIRECT_MODE_SUMMARY_FORMAT"] = summary_format
565
+ if sample_reference_table:
566
+ env["DIRECT_MODE_SAMPLE_REFERENCE_TABLE"] = sample_reference_table
567
+ if no_of_sampled_summaries is not None:
568
+ env["DIRECT_MODE_NO_OF_SAMPLED_SUMMARIES"] = str(no_of_sampled_summaries)
569
+ if random_seed is not None:
570
+ env["DIRECT_MODE_RANDOM_SEED"] = str(random_seed)
571
+
572
+ # Output format arguments
573
+ if create_xlsx_output is not None:
574
+ env["DIRECT_MODE_CREATE_XLSX_OUTPUT"] = str(create_xlsx_output)
575
+
576
+ # Logging arguments
577
+ if save_logs_to_csv is not None:
578
+ env["SAVE_LOGS_TO_CSV"] = str(save_logs_to_csv)
579
+ if save_logs_to_dynamodb is not None:
580
+ env["SAVE_LOGS_TO_DYNAMODB"] = str(save_logs_to_dynamodb)
581
+ if cost_code:
582
+ env["DEFAULT_COST_CODE"] = cost_code
583
+
584
+ # 3. Build command (just run app.py, no arguments needed in direct mode)
585
+ command = [sys.executable, app_abs_path]
586
+ command_str = " ".join(str(arg) for arg in command)
587
+ print(f"Executing direct mode command: {command_str}")
588
+ print(f"Direct mode task: {task}")
589
+ if input_file:
590
+ print(f"Input file: {input_abs_path}")
591
+ if text_column:
592
+ print(f"Text column: {text_column}")
593
+
594
+ # 4. Execute the command using subprocess
595
+ try:
596
+ result = subprocess.Popen(
597
+ command,
598
+ stdout=subprocess.PIPE,
599
+ stderr=subprocess.STDOUT, # Combine stderr with stdout to avoid deadlocks
600
+ text=True,
601
+ cwd=script_folder, # Important for relative paths within the script
602
+ env=env,
603
+ bufsize=0, # Unbuffered
604
+ )
605
+
606
+ # Read output in real-time to avoid deadlocks
607
+ start_time = time.time()
608
+
609
+ # For Windows, we need a different approach
610
+ if sys.platform == "win32":
611
+ # On Windows, use communicate with timeout
612
+ try:
613
+ stdout, stderr = result.communicate(timeout=timeout)
614
+ except subprocess.TimeoutExpired:
615
+ result.kill()
616
+ stdout, stderr = result.communicate()
617
+ raise subprocess.TimeoutExpired(result.args, timeout)
618
+ else:
619
+ # On Unix, we can use select for real-time reading
620
+ import select
621
+
622
+ stdout_lines = []
623
+ while result.poll() is None:
624
+ ready, _, _ = select.select([result.stdout], [], [], 0.1)
625
+ if ready:
626
+ line = result.stdout.readline()
627
+ if line:
628
+ print(line.rstrip(), flush=True)
629
+ stdout_lines.append(line)
630
+ # Check timeout
631
+ if time.time() - start_time > timeout:
632
+ result.kill()
633
+ raise subprocess.TimeoutExpired(result.args, timeout)
634
+
635
+ # Read remaining output
636
+ remaining = result.stdout.read()
637
+ if remaining:
638
+ print(remaining, end="", flush=True)
639
+ stdout_lines.append(remaining)
640
+
641
+ stdout = "".join(stdout_lines)
642
+ stderr = "" # Combined with stdout
643
+
644
+ print("--- SCRIPT STDOUT ---")
645
+ if stdout:
646
+ print(stdout)
647
+ print("--- SCRIPT STDERR ---")
648
+ if stderr:
649
+ print(stderr)
650
+ print("---------------------")
651
+
652
+ # Analyze the output for errors and success indicators
653
+ analysis = analyze_test_output(stdout, stderr)
654
+
655
+ if analysis["has_errors"]:
656
+ print("❌ Errors detected in output:")
657
+ for i, error_type in enumerate(analysis["error_types"]):
658
+ print(f" {i+1}. {error_type}")
659
+ if analysis["error_messages"]:
660
+ print(" Error messages:")
661
+ for msg in analysis["error_messages"][
662
+ :3
663
+ ]: # Show first 3 error messages
664
+ print(f" - {msg}")
665
+ return False
666
+ elif result.returncode == 0:
667
+ success_msg = "βœ… Script executed successfully."
668
+ if analysis["success_indicators"]:
669
+ success_msg += f" (Success indicators: {', '.join(analysis['success_indicators'][:3])})"
670
+ print(success_msg)
671
+ return True
672
+ else:
673
+ print(f"❌ Command failed with return code {result.returncode}")
674
+ return False
675
+
676
+ except subprocess.TimeoutExpired:
677
+ result.kill()
678
+ print(f"❌ Subprocess timed out after {timeout} seconds.")
679
+ return False
680
+ except Exception as e:
681
+ print(f"❌ An unexpected error occurred: {e}")
682
+ return False
683
+
684
+
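
For reference, a hand-driven direct-mode smoke run might look like the sketch below; the paths are placeholders relative to the repo root.

    ok = run_app_direct_mode(
        app_path="app.py",
        task="extract",
        input_file="example_data/combined_case_notes.csv",
        text_column="Case Note",
        output_dir="output",
        model_choice="test-model",
        inference_server_model="test-model",
        api_url="http://localhost:8080",
        create_xlsx_output=False,
        save_logs_to_csv=False,
    )
    print("direct mode ok" if ok else "direct mode failed")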
685
+ class TestCLITopicsExamples(unittest.TestCase):
686
+ """Test suite for CLI topic extraction examples from the epilog."""
687
+
688
+ @classmethod
689
+ def setUpClass(cls):
690
+ """Set up test environment before running tests."""
691
+ cls.script_path = os.path.join(
692
+ os.path.dirname(os.path.dirname(__file__)), "cli_topics.py"
693
+ )
694
+ cls.example_data_dir = os.path.join(
695
+ os.path.dirname(os.path.dirname(__file__)), "example_data"
696
+ )
697
+ cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")
698
+
699
+ # Verify script exists
700
+ if not os.path.isfile(cls.script_path):
701
+ raise FileNotFoundError(f"CLI script not found: {cls.script_path}")
702
+
703
+ print(f"Test setup complete. Script: {cls.script_path}")
704
+ print(f"Example data directory: {cls.example_data_dir}")
705
+ print(f"Temp output directory: {cls.temp_output_dir}")
706
+ print("Using function mocking instead of HTTP server")
707
+
708
+ # Debug: Check if example data directory exists and list contents
709
+ if os.path.exists(cls.example_data_dir):
710
+ print("Example data directory exists. Contents:")
711
+ for item in os.listdir(cls.example_data_dir):
712
+ item_path = os.path.join(cls.example_data_dir, item)
713
+ if os.path.isfile(item_path):
714
+ print(f" File: {item} ({os.path.getsize(item_path)} bytes)")
715
+ else:
716
+ print(f" Directory: {item}")
717
+ else:
718
+ print(f"Example data directory does not exist: {cls.example_data_dir}")
719
+
720
+ @classmethod
721
+ def tearDownClass(cls):
722
+ """Clean up test environment after running tests."""
723
+ if os.path.exists(cls.temp_output_dir):
724
+ shutil.rmtree(cls.temp_output_dir)
725
+ print(f"Cleaned up temp directory: {cls.temp_output_dir}")
726
+
727
+ def test_extract_topics_default_settings(self):
728
+ """Test: Extract topics from a CSV file with default settings"""
729
+ print("\n=== Testing topic extraction with default settings ===")
730
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
731
+
732
+ if not os.path.isfile(input_file):
733
+ self.skipTest(f"Example file not found: {input_file}")
734
+
735
+ result = run_cli_topics(
736
+ script_path=self.script_path,
737
+ task="extract",
738
+ input_file=input_file,
739
+ text_column="Case Note",
740
+ output_dir=self.temp_output_dir,
741
+ model_choice="test-model",
742
+ inference_server_model="test-model",
743
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
744
+ create_xlsx_output=False,
745
+ save_logs_to_csv=False,
746
+ )
747
+
748
+ self.assertTrue(result, "Topic extraction with default settings should succeed")
749
+ print("βœ… Topic extraction with default settings passed")
750
+
751
+ def test_extract_topics_custom_model_and_context(self):
752
+ """Test: Extract topics with custom model and context"""
753
+ print("\n=== Testing topic extraction with custom model and context ===")
754
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
755
+
756
+ if not os.path.isfile(input_file):
757
+ self.skipTest(f"Example file not found: {input_file}")
758
+
759
+ result = run_cli_topics(
760
+ script_path=self.script_path,
761
+ task="extract",
762
+ input_file=input_file,
763
+ text_column="Case Note",
764
+ output_dir=self.temp_output_dir,
765
+ model_choice="test-model",
766
+ inference_server_model="test-model",
767
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
768
+ context="Social Care case notes for young people",
769
+ create_xlsx_output=False,
770
+ save_logs_to_csv=False,
771
+ )
772
+
773
+ self.assertTrue(
774
+ result, "Topic extraction with custom model and context should succeed"
775
+ )
776
+ print("βœ… Topic extraction with custom model and context passed")
777
+
778
+ def test_extract_topics_with_grouping(self):
779
+ """Test: Extract topics with grouping"""
780
+ print("\n=== Testing topic extraction with grouping ===")
781
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
782
+
783
+ if not os.path.isfile(input_file):
784
+ self.skipTest(f"Example file not found: {input_file}")
785
+
786
+ result = run_cli_topics(
787
+ script_path=self.script_path,
788
+ task="extract",
789
+ input_file=input_file,
790
+ text_column="Case Note",
791
+ output_dir=self.temp_output_dir,
792
+ group_by="Client",
793
+ model_choice="test-model",
794
+ inference_server_model="test-model",
795
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
796
+ create_xlsx_output=False,
797
+ save_logs_to_csv=False,
798
+ )
799
+
800
+ self.assertTrue(result, "Topic extraction with grouping should succeed")
801
+ print("βœ… Topic extraction with grouping passed")
802
+
803
+ def test_extract_topics_with_candidate_topics(self):
804
+ """Test: Extract topics with candidate topics (zero-shot)"""
805
+ print("\n=== Testing topic extraction with candidate topics ===")
806
+ input_file = os.path.join(
807
+ self.example_data_dir, "dummy_consultation_response.csv"
808
+ )
809
+ candidate_topics_file = os.path.join(
810
+ self.example_data_dir, "dummy_consultation_response_themes.csv"
811
+ )
812
+
813
+ if not os.path.isfile(input_file):
814
+ self.skipTest(f"Example file not found: {input_file}")
815
+ if not os.path.isfile(candidate_topics_file):
816
+ self.skipTest(f"Candidate topics file not found: {candidate_topics_file}")
817
+
818
+ result = run_cli_topics(
819
+ script_path=self.script_path,
820
+ task="extract",
821
+ input_file=input_file,
822
+ text_column="Response text",
823
+ output_dir=self.temp_output_dir,
824
+ candidate_topics=candidate_topics_file,
825
+ model_choice="test-model",
826
+ inference_server_model="test-model",
827
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
828
+ create_xlsx_output=False,
829
+ save_logs_to_csv=False,
830
+ )
831
+
832
+ self.assertTrue(result, "Topic extraction with candidate topics should succeed")
833
+ print("βœ… Topic extraction with candidate topics passed")
834
+
835
+ def test_deduplicate_topics_fuzzy(self):
836
+ """Test: Deduplicate topics using fuzzy matching"""
837
+ print("\n=== Testing topic deduplication with fuzzy matching ===")
838
+
839
+ # First, we need to create some output files by running extraction
840
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
841
+
842
+ if not os.path.isfile(input_file):
843
+ self.skipTest(f"Example file not found: {input_file}")
844
+
845
+ # Run extraction first to create output files
846
+ extract_result = run_cli_topics(
847
+ script_path=self.script_path,
848
+ task="extract",
849
+ input_file=input_file,
850
+ text_column="Case Note",
851
+ output_dir=self.temp_output_dir,
852
+ model_choice="test-model",
853
+ inference_server_model="test-model",
854
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
855
+ create_xlsx_output=False,
856
+ save_logs_to_csv=False,
857
+ )
858
+
859
+ if not extract_result:
860
+ self.skipTest("Extraction failed, cannot test deduplication")
861
+
862
+ # Find the output files (they should be in temp_output_dir)
863
+ # The file names follow a pattern like: {input_file_name}_col_{text_column}_reference_table.csv
864
+ import glob
865
+
866
+ reference_files = glob.glob(
867
+ os.path.join(self.temp_output_dir, "*reference_table.csv")
868
+ )
869
+ unique_files = glob.glob(
870
+ os.path.join(self.temp_output_dir, "*unique_topics.csv")
871
+ )
872
+
873
+ if not reference_files or not unique_files:
874
+ self.skipTest("Could not find output files from extraction")
875
+
876
+ result = run_cli_topics(
877
+ script_path=self.script_path,
878
+ task="deduplicate",
879
+ previous_output_files=[reference_files[0], unique_files[0]],
880
+ output_dir=self.temp_output_dir,
881
+ method="fuzzy",
882
+ similarity_threshold=90,
883
+ create_xlsx_output=False,
884
+ save_logs_to_csv=False,
885
+ )
886
+
887
+ self.assertTrue(
888
+ result, "Topic deduplication with fuzzy matching should succeed"
889
+ )
890
+ print("βœ… Topic deduplication with fuzzy matching passed")
891
+
892
+ def test_deduplicate_topics_llm(self):
893
+ """Test: Deduplicate topics using LLM"""
894
+ print("\n=== Testing topic deduplication with LLM ===")
895
+
896
+ # First, we need to create some output files by running extraction
897
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
898
+
899
+ if not os.path.isfile(input_file):
900
+ self.skipTest(f"Example file not found: {input_file}")
901
+
902
+ # Run extraction first to create output files
903
+ extract_result = run_cli_topics(
904
+ script_path=self.script_path,
905
+ task="extract",
906
+ input_file=input_file,
907
+ text_column="Case Note",
908
+ output_dir=self.temp_output_dir,
909
+ model_choice="test-model",
910
+ inference_server_model="test-model",
911
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
912
+ create_xlsx_output=False,
913
+ save_logs_to_csv=False,
914
+ )
915
+
916
+ if not extract_result:
917
+ self.skipTest("Extraction failed, cannot test deduplication")
918
+
919
+ # Find the output files
920
+ import glob
921
+
922
+ reference_files = glob.glob(
923
+ os.path.join(self.temp_output_dir, "*reference_table.csv")
924
+ )
925
+ unique_files = glob.glob(
926
+ os.path.join(self.temp_output_dir, "*unique_topics.csv")
927
+ )
928
+
929
+ if not reference_files or not unique_files:
930
+ self.skipTest("Could not find output files from extraction")
931
+
932
+ result = run_cli_topics(
933
+ script_path=self.script_path,
934
+ task="deduplicate",
935
+ previous_output_files=[reference_files[0], unique_files[0]],
936
+ output_dir=self.temp_output_dir,
937
+ method="llm",
938
+ model_choice="test-model",
939
+ inference_server_model="test-model",
940
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
941
+ create_xlsx_output=False,
942
+ save_logs_to_csv=False,
943
+ )
944
+
945
+ self.assertTrue(result, "Topic deduplication with LLM should succeed")
946
+ print("βœ… Topic deduplication with LLM passed")
947
+
948
+ def test_all_in_one_pipeline(self):
949
+ """Test: Run complete pipeline (extract, deduplicate, summarise)"""
950
+ print("\n=== Testing all-in-one pipeline ===")
951
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
952
+
953
+ if not os.path.isfile(input_file):
954
+ self.skipTest(f"Example file not found: {input_file}")
955
+
956
+ result = run_cli_topics(
957
+ script_path=self.script_path,
958
+ task="all_in_one",
959
+ input_file=input_file,
960
+ text_column="Case Note",
961
+ output_dir=self.temp_output_dir,
962
+ model_choice="test-model",
963
+ inference_server_model="test-model",
964
+ api_url="http://localhost:8080", # URL doesn't matter with function mocking
965
+ create_xlsx_output=False,
966
+ save_logs_to_csv=False,
967
+ timeout=120, # Shorter timeout for debugging
968
+ )
969
+
970
+ self.assertTrue(result, "All-in-one pipeline should succeed")
971
+ print("βœ… All-in-one pipeline passed")
972
+
973
+ def test_direct_mode_extract(self):
974
+ """Test: Run app in direct mode for topic extraction"""
975
+ print("\n=== Testing direct mode - topic extraction ===")
976
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
977
+
978
+ if not os.path.isfile(input_file):
979
+ self.skipTest(f"Example file not found: {input_file}")
980
+
981
+ app_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "app.py")
982
+
983
+ if not os.path.isfile(app_path):
984
+ self.skipTest(f"App script not found: {app_path}")
985
+
986
+ result = run_app_direct_mode(
987
+ app_path=app_path,
988
+ task="extract",
989
+ input_file=input_file,
990
+ text_column="Case Note",
991
+ output_dir=self.temp_output_dir,
992
+ model_choice="test-model",
993
+ inference_server_model="test-model",
994
+ api_url="http://localhost:8080",
995
+ create_xlsx_output=False,
996
+ save_logs_to_csv=False,
997
+ )
998
+
999
+ self.assertTrue(result, "Direct mode topic extraction should succeed")
1000
+ print("βœ… Direct mode topic extraction passed")
1001
+
1002
+
1003
+ def run_all_tests():
1004
+ """Run all test examples and report results."""
1005
+ print("=" * 80)
1006
+ print("LLM TOPIC MODELLER TEST SUITE")
1007
+ print("=" * 80)
1008
+ print("This test suite includes:")
1009
+ print("- CLI examples from the epilog")
1010
+ print("- GUI application tests")
1011
+ print("- Tests use a mock inference-server to avoid API costs")
1012
+ print("Tests will be skipped if required example files are not found.")
1013
+ print("=" * 80)
1014
+
1015
+ # Create test suite
1016
+ loader = unittest.TestLoader()
1017
+ suite = unittest.TestSuite()
1018
+
1019
+ # Add CLI tests
1020
+ cli_suite = loader.loadTestsFromTestCase(TestCLITopicsExamples)
1021
+ suite.addTests(cli_suite)
1022
+
1023
+ # Add GUI tests
1024
+ try:
1025
+ from test.test_gui_only import TestGUIAppOnly
1026
+
1027
+ gui_suite = loader.loadTestsFromTestCase(TestGUIAppOnly)
1028
+ suite.addTests(gui_suite)
1029
+ print("GUI tests included in test suite.")
1030
+ except ImportError as e:
1031
+ print(f"Warning: Could not import GUI tests: {e}")
1032
+ print("Skipping GUI tests.")
1033
+
1034
+ # Run tests with detailed output
1035
+ runner = unittest.TextTestRunner(verbosity=2, stream=None)
1036
+ result = runner.run(suite)
1037
+
1038
+ # Print summary
1039
+ print("\n" + "=" * 80)
1040
+ print("TEST SUMMARY")
1041
+ print("=" * 80)
1042
+ print(f"Tests run: {result.testsRun}")
1043
+ print(f"Failures: {len(result.failures)}")
1044
+ print(f"Errors: {len(result.errors)}")
1045
+ print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
1046
+
1047
+ if result.failures:
1048
+ print("\nFAILURES:")
1049
+ for test, traceback in result.failures:
1050
+ print(f"- {test}: {traceback}")
1051
+
1052
+ if result.errors:
1053
+ print("\nERRORS:")
1054
+ for test, traceback in result.errors:
1055
+ print(f"- {test}: {traceback}")
1056
+
1057
+ success = len(result.failures) == 0 and len(result.errors) == 0
1058
+ print(f"\nOverall result: {'βœ… PASSED' if success else '❌ FAILED'}")
1059
+ print("=" * 80)
1060
+
1061
+ return success
1062
+
1063
+
1064
+ if __name__ == "__main__":
1065
+ # Run the test suite
1066
+ success = run_all_tests()
1067
+ exit(0 if success else 1)
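
The extract-then-deduplicate chaining used by test_deduplicate_topics_fuzzy can also be reproduced by hand. A minimal sketch, assuming the bundled example data and an output/ directory:

    import glob

    from test.test import run_cli_topics

    run_cli_topics(
        script_path="cli_topics.py",
        task="extract",
        input_file="example_data/combined_case_notes.csv",
        text_column="Case Note",
        output_dir="output",
        model_choice="test-model",
        inference_server_model="test-model",
        api_url="http://localhost:8080",
        create_xlsx_output=False,
        save_logs_to_csv=False,
    )
    reference = glob.glob("output/*reference_table.csv")[0]
    unique = glob.glob("output/*unique_topics.csv")[0]
    run_cli_topics(
        script_path="cli_topics.py",
        task="deduplicate",
        previous_output_files=[reference, unique],
        output_dir="output",
        method="fuzzy",
        similarity_threshold=90,
        create_xlsx_output=False,
        save_logs_to_csv=False,
    )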
test/test_gui_only.py ADDED
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone GUI test script for the LLM topic modeller application.
4
+
5
+ This script tests only the GUI functionality of app.py to ensure it loads correctly.
6
+ Run this script to verify that the Gradio interface can be imported and initialized.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ import threading
12
+ import unittest
13
+
14
+ # Add the parent directory to the path so we can import the app
15
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
16
+ if parent_dir not in sys.path:
17
+ sys.path.insert(0, parent_dir)
18
+
19
+
20
+ class TestGUIAppOnly(unittest.TestCase):
21
+ """Test suite for GUI application loading and basic functionality."""
22
+
23
+ @classmethod
24
+ def setUpClass(cls):
25
+ """Set up test environment for GUI tests."""
26
+ cls.app_path = os.path.join(parent_dir, "app.py")
27
+
28
+ # Verify app.py exists
29
+ if not os.path.isfile(cls.app_path):
30
+ raise FileNotFoundError(f"App file not found: {cls.app_path}")
31
+
32
+ print(f"GUI test setup complete. App: {cls.app_path}")
33
+
34
+ def test_app_import_and_initialization(self):
35
+ """Test: Import app.py and check if the Gradio app object is created successfully."""
36
+ print("\n=== Testing GUI app import and initialization ===")
37
+
38
+ try:
39
+ # Import the app module
40
+ import app
41
+
42
+ # Check if the app object exists and is a Gradio Blocks object
43
+ self.assertTrue(
44
+ hasattr(app, "app"), "App object should exist in the module"
45
+ )
46
+
47
+ # Check if it's a Gradio Blocks instance
48
+ import gradio as gr
49
+
50
+ self.assertIsInstance(
51
+ app.app, gr.Blocks, "App should be a Gradio Blocks instance"
52
+ )
53
+
54
+ print("βœ… GUI app import and initialization passed")
55
+
56
+ except ImportError as e:
57
+ error_msg = f"Failed to import app module: {e}"
58
+ self.fail(error_msg)
59
+ except Exception as e:
60
+ self.fail(f"Unexpected error during app initialization: {e}")
61
+
62
+ def test_app_launch_headless(self):
63
+ """Test: Launch the app in headless mode to verify it starts without errors."""
64
+ print("\n=== Testing GUI app launch in headless mode ===")
65
+
66
+ try:
67
+ # Import the app module
68
+ import app
69
+
70
+ # Set up a flag to track if the app launched successfully
71
+ app_launched = threading.Event()
72
+ launch_error = []  # list, so the nested launch thread can record a failure
73
+
74
+ def launch_app():
75
+ try:
76
+ # Launch the app in headless mode with a short timeout
77
+ app.app.launch(
78
+ show_error=True,
79
+ inbrowser=False, # Don't open browser
80
+ server_port=0, # Use any available port
81
+ quiet=True, # Suppress output
82
+ prevent_thread_lock=True, # Don't block the main thread
83
+ )
84
+ app_launched.set()
85
+ except Exception as e:
86
+ launch_error.append(e); app_launched.set()
87
+
88
+ # Start the app in a separate thread
89
+ launch_thread = threading.Thread(target=launch_app)
90
+ launch_thread.daemon = True
91
+ launch_thread.start()
92
+
93
+ # Wait for the app to launch (with timeout)
94
+ if app_launched.wait(timeout=10): # 10 second timeout
95
+ if launch_error:
96
+ self.fail(f"App launch failed: {launch_error[0]}")
97
+ else:
98
+ print("βœ… GUI app launch in headless mode passed")
99
+ else:
100
+ self.fail("App launch timed out after 10 seconds")
101
+
102
+ except Exception as e:
103
+ error_msg = f"Unexpected error during app launch test: {e}"
104
+ self.fail(error_msg)
105
+
106
+ def test_app_configuration_loading(self):
107
+ """Test: Verify that the app can load its configuration without errors."""
108
+ print("\n=== Testing GUI app configuration loading ===")
109
+
110
+ try:
111
+ # Check if key configuration variables are accessible
112
+ # These should be imported from tools.config
113
+ from tools.config import (
114
+ DEFAULT_COST_CODE,
115
+ GRADIO_SERVER_PORT,
116
+ MAX_FILE_SIZE,
117
+ default_model_choice,
118
+ model_name_map,
119
+ )
120
+
121
+ # Verify these are not None/empty
122
+ self.assertIsNotNone(
123
+ GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
124
+ )
125
+ self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
126
+ self.assertIsNotNone(
127
+ DEFAULT_COST_CODE, "DEFAULT_COST_CODE should be configured"
128
+ )
129
+ self.assertIsNotNone(
130
+ default_model_choice, "default_model_choice should be configured"
131
+ )
132
+ self.assertIsNotNone(model_name_map, "model_name_map should be configured")
133
+
134
+ print("βœ… GUI app configuration loading passed")
135
+
136
+ except ImportError as e:
137
+ error_msg = f"Failed to import configuration: {e}"
138
+ self.fail(error_msg)
139
+ except Exception as e:
140
+ error_msg = f"Unexpected error during configuration test: {e}"
141
+ self.fail(error_msg)
142
+
143
+
144
+ def run_gui_tests():
145
+ """Run GUI tests and report results."""
146
+ print("=" * 80)
147
+ print("LLM TOPIC MODELLER GUI TEST SUITE")
148
+ print("=" * 80)
149
+ print("This test suite verifies that the GUI application loads correctly.")
150
+ print("=" * 80)
151
+
152
+ # Create test suite
153
+ loader = unittest.TestLoader()
154
+ suite = loader.loadTestsFromTestCase(TestGUIAppOnly)
155
+
156
+ # Run tests with detailed output
157
+ runner = unittest.TextTestRunner(verbosity=2, stream=None)
158
+ result = runner.run(suite)
159
+
160
+ # Print summary
161
+ print("\n" + "=" * 80)
162
+ print("GUI TEST SUMMARY")
163
+ print("=" * 80)
164
+ print(f"Tests run: {result.testsRun}")
165
+ print(f"Failures: {len(result.failures)}")
166
+ print(f"Errors: {len(result.errors)}")
167
+ print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
168
+
169
+ if result.failures:
170
+ print("\nFAILURES:")
171
+ for test, traceback in result.failures:
172
+ print(f"- {test}: {traceback}")
173
+
174
+ if result.errors:
175
+ print("\nERRORS:")
176
+ for test, traceback in result.errors:
177
+ print(f"- {test}: {traceback}")
178
+
179
+ success = len(result.failures) == 0 and len(result.errors) == 0
180
+ print(f"\nOverall result: {'βœ… PASSED' if success else '❌ FAILED'}")
181
+ print("=" * 80)
182
+
183
+ return success
184
+
185
+
186
+ if __name__ == "__main__":
187
+ # Run the GUI test suite
188
+ success = run_gui_tests()
189
+ exit(0 if success else 1)
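
To run only these GUI checks from an interactive session rather than as a script:

    from test.test_gui_only import run_gui_tests

    if not run_gui_tests():
        raise SystemExit(1)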
tools/__init__.py ADDED
File without changes
tools/auth.py ADDED
@@ -0,0 +1,85 @@
1
+ import base64
2
+ import hashlib
3
+ import hmac
4
+
5
+ import boto3
6
+
7
+ from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_REGION, AWS_USER_POOL_ID
8
+
9
+
10
+ def calculate_secret_hash(client_id: str, client_secret: str, username: str):
11
+ message = username + client_id
12
+ dig = hmac.new(
13
+ str(client_secret).encode("utf-8"),
14
+ msg=str(message).encode("utf-8"),
15
+ digestmod=hashlib.sha256,
16
+ ).digest()
17
+ secret_hash = base64.b64encode(dig).decode()
18
+ return secret_hash
19
+
20
+
21
+ def authenticate_user(
22
+ username: str,
23
+ password: str,
24
+ user_pool_id: str = AWS_USER_POOL_ID,
25
+ client_id: str = AWS_CLIENT_ID,
26
+ client_secret: str = AWS_CLIENT_SECRET,
27
+ ):
28
+ """Authenticates a user against an AWS Cognito user pool.
29
+
30
+ Args:
31
+ user_pool_id (str): The ID of the Cognito user pool.
32
+ client_id (str): The ID of the Cognito user pool client.
33
+ username (str): The username of the user.
34
+ password (str): The password of the user.
35
+ client_secret (str): The client secret of the app client
36
+
37
+ Returns:
38
+ bool: True if the user is authenticated, False otherwise.
39
+ """
40
+
41
+ client = boto3.client(
42
+ "cognito-idp", region_name=AWS_REGION
43
+ ) # Cognito Identity Provider client
44
+
45
+ # Compute the secret hash
46
+ secret_hash = calculate_secret_hash(client_id, client_secret, username)
47
+
48
+ try:
49
+
50
+ if client_secret == "":
51
+ response = client.initiate_auth(
52
+ AuthFlow="USER_PASSWORD_AUTH",
53
+ AuthParameters={
54
+ "USERNAME": username,
55
+ "PASSWORD": password,
56
+ },
57
+ ClientId=client_id,
58
+ )
59
+
60
+ else:
61
+ response = client.initiate_auth(
62
+ AuthFlow="USER_PASSWORD_AUTH",
63
+ AuthParameters={
64
+ "USERNAME": username,
65
+ "PASSWORD": password,
66
+ "SECRET_HASH": secret_hash,
67
+ },
68
+ ClientId=client_id,
69
+ )
70
+
71
+ # If successful, you'll receive an AuthenticationResult in the response
72
+ if response.get("AuthenticationResult"):
73
+ return True
74
+ else:
75
+ return False
76
+
77
+ except client.exceptions.NotAuthorizedException:
78
+ return False
79
+ except client.exceptions.UserNotFoundException:
80
+ return False
81
+ except Exception as e:
82
+ out_message = f"An error occurred: {e}"
83
+ print(out_message)
84
+ raise Exception(out_message) from e
85
+
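
A hedged usage sketch: the username, password, pool and client IDs below are placeholders (in the app they come from tools.config), and a real call needs reachable Cognito credentials. Passing an empty client_secret selects the no-SECRET_HASH branch above.

    from tools.auth import authenticate_user

    is_valid = authenticate_user(
        username="jane.doe",
        password="example-password",
        user_pool_id="eu-west-2_EXAMPLE",
        client_id="example-client-id",
        client_secret="",
    )
    print("authenticated" if is_valid else "rejected")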
tools/aws_functions.py ADDED
@@ -0,0 +1,387 @@
1
+ import os
2
+ from typing import List
3
+
4
+ import boto3
5
+
6
+ from tools.config import (
7
+ AWS_ACCESS_KEY,
8
+ AWS_REGION,
9
+ AWS_SECRET_KEY,
10
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
11
+ RUN_AWS_FUNCTIONS,
12
+ S3_LOG_BUCKET,
13
+ S3_OUTPUTS_BUCKET,
14
+ )
15
+
16
+ # Default bucket name (the S3 log bucket); may be empty if AWS is not configured
17
+ bucket_name = S3_LOG_BUCKET
18
+
19
+
20
+ def connect_to_bedrock_runtime(
21
+ model_name_map: dict,
22
+ model_choice: str,
23
+ aws_access_key_textbox: str = "",
24
+ aws_secret_key_textbox: str = "",
25
+ aws_region_textbox: str = "",
26
+ ):
27
+ # Determine the model's source; only AWS-sourced models need a Bedrock client
28
+ model_source = model_name_map[model_choice]["source"]
29
+
30
+ # Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
31
+ region = aws_region_textbox if aws_region_textbox else AWS_REGION
32
+
33
+ if "AWS" in model_source:
34
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
35
+ print("Connecting to Bedrock via existing SSO connection")
36
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
37
+ elif aws_access_key_textbox and aws_secret_key_textbox:
38
+ print(
39
+ "Connecting to Bedrock using the AWS access key and secret key from user input."
40
+ )
41
+ bedrock_runtime = boto3.client(
42
+ "bedrock-runtime",
43
+ aws_access_key_id=aws_access_key_textbox,
44
+ aws_secret_access_key=aws_secret_key_textbox,
45
+ region_name=region,
46
+ )
47
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
48
+ print("Getting Bedrock credentials from environment variables")
49
+ bedrock_runtime = boto3.client(
50
+ "bedrock-runtime",
51
+ aws_access_key_id=AWS_ACCESS_KEY,
52
+ aws_secret_access_key=AWS_SECRET_KEY,
53
+ region_name=region,
54
+ )
55
+ elif RUN_AWS_FUNCTIONS == "1":
56
+ print("Connecting to Bedrock via existing SSO connection")
57
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
58
+ else:
59
+ bedrock_runtime = ""
60
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
61
+ print(out_message)
62
+ raise Exception(out_message)
63
+ else:
64
+ bedrock_runtime = None
65
+
66
+ return bedrock_runtime
67
+
68
+
69
+ def connect_to_s3_client(
70
+ aws_access_key_textbox: str = "",
71
+ aws_secret_key_textbox: str = "",
72
+ aws_region_textbox: str = "",
73
+ ):
74
+ # Build an S3 client, preferring user-supplied keys, then SSO, then environment keys
75
+ s3_client = None
76
+
77
+ # Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
78
+ region = aws_region_textbox if aws_region_textbox else AWS_REGION
79
+
80
+ if aws_access_key_textbox and aws_secret_key_textbox:
81
+ print("Connecting to s3 using the AWS access key and secret key from user input.")
82
+ s3_client = boto3.client(
83
+ "s3",
84
+ aws_access_key_id=aws_access_key_textbox,
85
+ aws_secret_access_key=aws_secret_key_textbox,
86
+ region_name=region,
87
+ )
88
+ elif RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
89
+ print("Connecting to s3 via existing SSO connection")
90
+ s3_client = boto3.client("s3", region_name=region)
91
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
92
+ print("Getting s3 credentials from environment variables")
93
+ s3_client = boto3.client(
94
+ "s3",
95
+ aws_access_key_id=AWS_ACCESS_KEY,
96
+ aws_secret_access_key=AWS_SECRET_KEY,
97
+ region_name=region,
98
+ )
99
+ elif RUN_AWS_FUNCTIONS == "1":
100
+ print("Connecting to s3 via existing SSO connection")
101
+ s3_client = boto3.client("s3", region_name=region)
102
+ else:
103
+ s3_client = ""
104
+ out_message = "Cannot connect to S3 service. Please provide access keys under LLM settings, or choose another model type."
105
+ print(out_message)
106
+ raise Exception(out_message)
107
+
108
+ return s3_client
109
+
110
+
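
As a usage sketch, resolving a client through the credential fallback chain above and listing a prefix; the bucket and prefix names are placeholders.

    s3 = connect_to_s3_client()  # user keys, SSO, or env keys, per the chain above
    response = s3.list_objects_v2(Bucket="my-log-bucket", Prefix="logs/usage/")
    for obj in response.get("Contents", []):
        print(obj["Key"], obj["Size"])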
111
+ # Download direct from S3 - requires login credentials
112
+ def download_file_from_s3(
113
+ bucket_name: str,
114
+ key: str,
115
+ local_file_path: str,
116
+ aws_access_key_textbox: str = "",
117
+ aws_secret_key_textbox: str = "",
118
+ aws_region_textbox: str = "",
119
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
120
+ ):
121
+
122
+ if RUN_AWS_FUNCTIONS == "1":
123
+
124
+ s3 = connect_to_s3_client(
125
+ aws_access_key_textbox, aws_secret_key_textbox, aws_region_textbox
126
+ )
127
+ # boto3.client('s3')
128
+ s3.download_file(bucket_name, key, local_file_path)
129
+ print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
130
+
131
+
132
+ def download_folder_from_s3(
133
+ bucket_name: str,
134
+ s3_folder: str,
135
+ local_folder: str,
136
+ aws_access_key_textbox: str = "",
137
+ aws_secret_key_textbox: str = "",
138
+ aws_region_textbox: str = "",
139
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
140
+ ):
141
+ """
142
+ Download all files from an S3 folder to a local folder.
143
+ """
144
+ if RUN_AWS_FUNCTIONS == "1":
145
+ s3 = connect_to_s3_client(
146
+ aws_access_key_textbox, aws_secret_key_textbox, aws_region_textbox
147
+ )
148
+ # boto3.client('s3')
149
+
150
+ # List objects in the specified S3 folder
151
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
152
+
153
+ # Download each object
154
+ for obj in response.get("Contents", []):
155
+ # Extract object key and construct local file path
156
+ object_key = obj["Key"]
157
+ local_file_path = os.path.join(
158
+ local_folder, os.path.relpath(object_key, s3_folder)
159
+ )
160
+
161
+ # Create directories if necessary
162
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
163
+
164
+ # Download the object
165
+ try:
166
+ s3.download_file(bucket_name, object_key, local_file_path)
167
+ print(
168
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
169
+ )
170
+ except Exception as e:
171
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
172
+
173
+
174
+ def download_files_from_s3(
175
+ bucket_name: str,
176
+ s3_folder: str,
177
+ local_folder: str,
178
+ filenames: list[str],
179
+ aws_access_key_textbox: str = "",
180
+ aws_secret_key_textbox: str = "",
181
+ aws_region_textbox: str = "",
182
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
183
+ ):
184
+ """
185
+ Download specific files from an S3 folder to a local folder. Pass the string "*" as filenames to download every object under the prefix.
186
+ """
187
+ if RUN_AWS_FUNCTIONS == "1":
188
+ s3 = connect_to_s3_client(
189
+ aws_access_key_textbox, aws_secret_key_textbox, aws_region_textbox
190
+ )
191
+ # boto3.client('s3')
192
+
193
+ print("Trying to download file: ", filenames)
194
+
195
+ if filenames == "*":
196
+ # List all objects in the S3 folder
197
+ print("Trying to download all files in AWS folder: ", s3_folder)
198
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
199
+
200
+ print("Found files in AWS folder: ", response.get("Contents", []))
201
+
202
+ filenames = [
203
+ obj["Key"].split("/")[-1] for obj in response.get("Contents", [])
204
+ ]
205
+
206
+ print("Found filenames in AWS folder: ", filenames)
207
+
208
+ for filename in filenames:
209
+ object_key = os.path.join(s3_folder, filename)
210
+ local_file_path = os.path.join(local_folder, filename)
211
+
212
+ # Create directories if necessary
213
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
214
+
215
+ # Download the object
216
+ try:
217
+ s3.download_file(bucket_name, object_key, local_file_path)
218
+ print(
219
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
220
+ )
221
+ except Exception as e:
222
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
223
+
224
+
225
+ def upload_file_to_s3(
226
+ local_file_paths: List[str],
227
+ s3_key: str,
228
+ s3_bucket: str = bucket_name,
229
+ aws_access_key_textbox: str = "",
230
+ aws_secret_key_textbox: str = "",
231
+ aws_region_textbox: str = "",
232
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
233
+ ):
234
+ """
235
+ Uploads a file from local machine to Amazon S3.
236
+
237
+ Args:
238
+ - local_file_paths: Local file path(s) of the file(s) to upload.
239
+ - s3_key: Key (path) to the file in the S3 bucket.
240
+ - s3_bucket: Name of the S3 bucket.
241
+
242
+ Returns:
243
+ - Message as variable/printed to console
244
+ """
245
+ if RUN_AWS_FUNCTIONS == "1":
246
+
247
+ final_out_message = list()
248
+
249
+ s3_client = connect_to_s3_client(
250
+ aws_access_key_textbox, aws_secret_key_textbox, aws_region_textbox
251
+ )
252
+ # boto3.client('s3')
253
+
254
+ if isinstance(local_file_paths, str):
255
+ local_file_paths = [local_file_paths]
256
+
257
+ for file in local_file_paths:
258
+ try:
259
+ # Get file name off file path
260
+ file_name = os.path.basename(file)
261
+
262
+ s3_key_full = s3_key + file_name
263
+ print("S3 key: ", s3_key_full)
264
+
265
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
266
+ out_message = "File " + file_name + " uploaded successfully!"
267
+ print(out_message)
268
+
269
+ except Exception as e:
270
+ out_message = f"Error uploading file(s): {e}"
271
+ print(out_message)
272
+
273
+ final_out_message.append(out_message)
274
+ final_out_message_str = "\n".join(final_out_message)
275
+
276
+ else:
277
+ final_out_message_str = "Not connected to AWS, no files uploaded."
278
+
279
+ return final_out_message_str
280
+
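
A usage sketch (bucket and paths are hypothetical). Note that s3_key is treated as a prefix: the file's basename is appended to it to form the full key:

    msg = upload_file_to_s3(
        local_file_paths=["output/topic_analysis.xlsx"],
        s3_key="outputs/session123/",
        s3_bucket="my-example-bucket",
    )
    print(msg)  # e.g. "File topic_analysis.xlsx uploaded successfully!"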
281
+
282
+ # Helper to upload outputs to S3 when enabled in config.
283
+ def export_outputs_to_s3(
284
+ file_list_state,
285
+ s3_output_folder_state_value: str,
286
+ save_outputs_to_s3_flag: bool,
287
+ base_file_state=None,
288
+ s3_bucket: str = S3_OUTPUTS_BUCKET,
289
+ ):
290
+ """
291
+ Upload a list of local output files to the configured S3 outputs folder.
292
+
293
+ - file_list_state: Gradio dropdown state that holds a list of file paths or a
294
+ single path/string. If blank/empty, no action is taken.
295
+ - s3_output_folder_state_value: Final S3 key prefix (including any session hash)
296
+ to use as the destination folder for uploads.
297
+ - save_outputs_to_s3_flag: Runtime toggle; when False, no upload is attempted.
+ - base_file_state: Optional original input file (path, list of paths, or
+ Gradio File object) used to derive a per-file subfolder name.
+ - s3_bucket: Name of the S3 bucket.
298
+ """
299
+ try:
300
+
301
+ # Respect the runtime toggle as well as environment configuration
302
+ if not save_outputs_to_s3_flag:
303
+ return
304
+
305
+ if not s3_output_folder_state_value:
306
+ # No configured S3 outputs folder – nothing to do
307
+ return
308
+
309
+ # Normalise input to a Python list of strings
310
+ file_paths = file_list_state
311
+ if not file_paths:
312
+ return
313
+
314
+ # Gradio dropdown may return a single string or a list
315
+ if isinstance(file_paths, str):
316
+ file_paths = [file_paths]
317
+
318
+ # Filter out any non-truthy values
319
+ file_paths = [p for p in file_paths if p]
320
+ if not file_paths:
321
+ return
322
+
323
+ # Derive a base file stem (name without extension) from the original
324
+ # file(s) being analysed, if provided. This is used to create an
325
+ # additional subfolder layer so that outputs are grouped under the
326
+ # analysed file name rather than under each output file name.
327
+ base_stem = None
328
+ if base_file_state:
329
+ base_path = None
330
+
331
+ # Gradio File components typically provide a list of objects with a `.name` attribute
332
+ if isinstance(base_file_state, str):
333
+ base_path = base_file_state
334
+ elif isinstance(base_file_state, list) and base_file_state:
335
+ first_item = base_file_state[0]
336
+ base_path = getattr(first_item, "name", None) or str(first_item)
337
+ else:
338
+ base_path = getattr(base_file_state, "name", None) or str(
339
+ base_file_state
340
+ )
341
+
342
+ if base_path:
343
+ base_name = os.path.basename(base_path)
344
+ base_stem, _ = os.path.splitext(base_name)
345
+
346
+ # Ensure base S3 prefix (session/date) ends with a trailing slash
347
+ base_prefix = s3_output_folder_state_value
348
+ if not base_prefix.endswith("/"):
349
+ base_prefix = base_prefix + "/"
350
+
351
+ # For each file, append a subfolder. If we have a derived base_stem
352
+ # from the input being analysed, use that; otherwise, fall back to
353
+ # the individual output file name stem. Final pattern:
354
+ # <session_output_folder>/<date>/<base_file_stem>/<file_name>
355
+ # or, if base_file_stem is not available:
356
+ # <session_output_folder>/<date>/<output_file_stem>/<file_name>
357
+ for file in file_paths:
358
+ file_name = os.path.basename(file)
359
+
360
+ if base_stem:
361
+ folder_stem = base_stem
362
+ else:
363
+ folder_stem, _ = os.path.splitext(file_name)
364
+
365
+ per_file_prefix = base_prefix + folder_stem + "/"
366
+
367
+ out_message = upload_file_to_s3(
368
+ local_file_paths=[file],
369
+ s3_key=per_file_prefix,
370
+ s3_bucket=s3_bucket,
371
+ )
372
+
373
+ # Log any issues to console so failures are visible in logs/stdout
374
+ if (
375
+ "Error uploading file" in out_message
376
+ or "could not upload" in out_message.lower()
377
+ ):
378
+ print("export_outputs_to_s3 encountered issues:", out_message)
379
+
380
+ print("Successfully uploaded outputs to S3")
381
+
382
+ except Exception as e:
383
+ # Do not break the app flow if S3 upload fails – just report to console
384
+ print(f"export_outputs_to_s3 failed with error: {e}")
385
+
386
+ # No GUI outputs to update
387
+ return
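
A worked example of the key layout described in the comments above (all names hypothetical): with s3_output_folder_state_value = "outputs/session123/2025-01-01", base_file_state = "input/consultation.csv" and an output file "output/summary.xlsx", the upload destination is:

    s3://<S3_OUTPUTS_BUCKET>/outputs/session123/2025-01-01/consultation/summary.xlsx

If no base file is supplied, the output file's own stem is used for the subfolder instead (".../2025-01-01/summary/summary.xlsx").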
tools/combine_sheets_into_xlsx.py ADDED
@@ -0,0 +1,615 @@
1
+ import os
2
+ from datetime import date, datetime
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ from openpyxl import Workbook
7
+ from openpyxl.styles import Alignment, Font
8
+ from openpyxl.utils import get_column_letter
9
+ from openpyxl.utils.dataframe import dataframe_to_rows
10
+
11
+ from tools.config import OUTPUT_FOLDER
12
+ from tools.config import model_name_map as global_model_name_map
13
+ from tools.helper_functions import (
14
+ clean_column_name,
15
+ convert_reference_table_to_pivot_table,
16
+ ensure_model_in_map,
17
+ get_basic_response_data,
18
+ load_in_data_file,
19
+ )
20
+
21
+
22
+ def add_cover_sheet(
23
+ wb: Workbook,
24
+ intro_paragraphs: list[str],
25
+ model_name: str,
26
+ analysis_date: str,
27
+ analysis_cost: str,
28
+ number_of_responses: int,
29
+ number_of_responses_with_text: int,
30
+ number_of_responses_with_text_five_plus_words: int,
31
+ llm_call_number: int,
32
+ input_tokens: int,
33
+ output_tokens: int,
34
+ time_taken: float,
35
+ file_name: str,
36
+ column_name: str,
37
+ number_of_responses_with_topic_assignment: int,
38
+ custom_title: str = "Cover Sheet",
39
+ ):
40
+ ws = wb.create_sheet(title=custom_title, index=0)
41
+
42
+ # Freeze top row
43
+ ws.freeze_panes = "A2"
44
+
45
+ # Write title
46
+ ws["A1"] = "Large Language Model Topic analysis"
47
+ ws["A1"].font = Font(size=14, bold=True)
48
+ ws["A1"].alignment = Alignment(wrap_text=True, vertical="top")
49
+
50
+ # Add intro paragraphs
51
+ row = 3
52
+ for paragraph in intro_paragraphs:
53
+ ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=2)
54
+ cell = ws.cell(row=row, column=1, value=paragraph)
55
+ cell.alignment = Alignment(wrap_text=True, vertical="top")
56
+ ws.row_dimensions[row].height = 60 # Adjust height as needed
57
+ row += 2
58
+
59
+ # Add metadata
60
+ meta_start = row + 1
61
+ metadata = {
62
+ "Date Excel file created": date.today().strftime("%Y-%m-%d"),
63
+ "File name": file_name,
64
+ "Column name": column_name,
65
+ "Model name": model_name,
66
+ "Analysis date": analysis_date,
67
+ # "Analysis cost": analysis_cost,
68
+ "Number of responses": number_of_responses,
69
+ "Number of responses with text": number_of_responses_with_text,
70
+ "Number of responses with text five plus words": number_of_responses_with_text_five_plus_words,
71
+ "Number of responses with at least one assigned topic": number_of_responses_with_topic_assignment,
72
+ "Number of LLM calls": llm_call_number,
73
+ "Total number of input tokens from LLM calls": input_tokens,
74
+ "Total number of output tokens from LLM calls": output_tokens,
75
+ "Total time taken for all LLM calls (seconds)": round(float(time_taken), 1),
76
+ }
77
+
78
+ for i, (label, value) in enumerate(metadata.items()):
79
+ row_num = meta_start + i
80
+ ws[f"A{row_num}"] = label
81
+ ws[f"A{row_num}"].font = Font(bold=True)
82
+
83
+ cell = ws[f"B{row_num}"]
84
+ cell.value = value
85
+ cell.alignment = Alignment(wrap_text=True)
86
+ # Optional: Adjust column widths
87
+ ws.column_dimensions["A"].width = 25
88
+ ws.column_dimensions["B"].width = 75
89
+
90
+ # Ensure first row cells are wrapped on the cover sheet
91
+ for col_idx in range(1, ws.max_column + 1):
92
+ header_cell = ws.cell(row=1, column=col_idx)
93
+ header_cell.alignment = Alignment(wrap_text=True, vertical="center")
94
+
95
+
96
+ def csvs_to_excel(
97
+ csv_files: list[str],
98
+ output_filename: str,
99
+ sheet_names: list[str] = None,
100
+ column_widths: dict = None, # Dict of {sheet_name: {col_letter: width}}
101
+ wrap_text_columns: dict = None, # Dict of {sheet_name: [col_letters]}
102
+ intro_text: list[str] = None,
103
+ model_name: str = "",
104
+ analysis_date: str = "",
105
+ analysis_cost: str = "",
106
+ llm_call_number: int = 0,
107
+ input_tokens: int = 0,
108
+ output_tokens: int = 0,
109
+ time_taken: float = 0,
110
+ number_of_responses: int = 0,
111
+ number_of_responses_with_text: int = 0,
112
+ number_of_responses_with_text_five_plus_words: int = 0,
113
+ column_name: str = "",
114
+ number_of_responses_with_topic_assignment: int = 0,
115
+ file_name: str = "",
116
+ unique_reference_numbers: list = [],
117
+ ):
118
+ if intro_text is None:
119
+ intro_text = list()
120
+
121
+ wb = Workbook()
122
+ # Remove default sheet
123
+ wb.remove(wb.active)
124
+
125
+ for idx, csv_path in enumerate(csv_files):
126
+ # Use provided sheet name or derive from file name
127
+ sheet_name = (
128
+ sheet_names[idx]
129
+ if sheet_names and idx < len(sheet_names)
130
+ else os.path.splitext(os.path.basename(csv_path))[0]
131
+ )
132
+ df = pd.read_csv(csv_path)
133
+
134
+ if sheet_name == "Original data":
135
+ try:
136
+ # Create a copy to avoid modifying the original
137
+ df_copy = df.copy()
138
+ # Insert the Reference column at position 0 (first column)
139
+ df_copy.insert(0, "Reference", unique_reference_numbers)
140
+ df = df_copy
141
+ except Exception as e:
142
+ print("Could not add reference number to original data due to:", e)
143
+
144
+ ws = wb.create_sheet(title=sheet_name)
145
+
146
+ for r_idx, row in enumerate(
147
+ dataframe_to_rows(df, index=False, header=True), start=1
148
+ ):
149
+ ws.append(row)
150
+
151
+ for col_idx, value in enumerate(row, start=1):
152
+ cell = ws.cell(row=r_idx, column=col_idx)
153
+
154
+ # Bold header row
155
+ if r_idx == 1:
156
+ cell.font = Font(bold=True)
157
+
158
+ # Set vertical alignment to middle by default
159
+ cell.alignment = Alignment(vertical="center")
160
+
161
+ # Apply wrap text if needed
162
+ if wrap_text_columns and sheet_name in wrap_text_columns:
163
+ for col_letter in wrap_text_columns[sheet_name]:
164
+ cell = ws[f"{col_letter}{r_idx}"]
165
+ cell.alignment = Alignment(vertical="center", wrap_text=True)
166
+
167
+ # Freeze top row for all data sheets
168
+ ws.freeze_panes = "A2"
169
+
170
+ # Ensure all header cells (first row) are wrapped
171
+ for col_idx in range(1, ws.max_column + 1):
172
+ header_cell = ws.cell(row=1, column=col_idx)
173
+ header_cell.alignment = Alignment(vertical="center", wrap_text=True)
174
+
175
+ # Set column widths
176
+ if column_widths and sheet_name in column_widths:
177
+ for col_letter, width in column_widths[sheet_name].items():
178
+ ws.column_dimensions[col_letter].width = width
179
+
180
+ add_cover_sheet(
181
+ wb,
182
+ intro_paragraphs=intro_text,
183
+ model_name=model_name,
184
+ analysis_date=analysis_date,
185
+ analysis_cost=analysis_cost,
186
+ number_of_responses=number_of_responses,
187
+ number_of_responses_with_text=number_of_responses_with_text,
188
+ number_of_responses_with_text_five_plus_words=number_of_responses_with_text_five_plus_words,
189
+ llm_call_number=llm_call_number,
190
+ input_tokens=input_tokens,
191
+ output_tokens=output_tokens,
192
+ time_taken=time_taken,
193
+ file_name=file_name,
194
+ column_name=column_name,
195
+ number_of_responses_with_topic_assignment=number_of_responses_with_topic_assignment,
196
+ )
197
+
198
+ wb.save(output_filename)
199
+
200
+ print(f"Output xlsx summary saved as '{output_filename}'")
201
+
202
+ return output_filename
203
+
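
A minimal call sketch for the function above (file paths, sheet names and widths are hypothetical; the cover-sheet metadata arguments are left at their defaults):

    output_path = csvs_to_excel(
        csv_files=["output/topic_summary.csv", "output/reference.csv"],
        output_filename="output/combined.xlsx",
        sheet_names=["Topic summary", "Response level data"],
        column_widths={"Topic summary": {"A": 25, "B": 100}},
        wrap_text_columns={"Topic summary": ["B"]},
        intro_text=["Example cover sheet paragraph."],
    )
    print(output_path)  # "output/combined.xlsx"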
204
+
205
+ ###
206
+ # Run the functions
207
+ ###
208
+ def collect_output_csvs_and_create_excel_output(
209
+ in_data_files: List,
210
+ chosen_cols: list[str],
211
+ reference_data_file_name_textbox: str,
212
+ in_group_col: str,
213
+ model_choice: str,
214
+ master_reference_df_state: pd.DataFrame,
215
+ master_unique_topics_df_state: pd.DataFrame,
216
+ summarised_output_df: pd.DataFrame,
217
+ missing_df_state: pd.DataFrame,
218
+ excel_sheets: str = "",
219
+ usage_logs_location: str = "",
220
+ model_name_map: dict = dict(),
221
+ output_folder: str = OUTPUT_FOLDER,
222
+ structured_summaries: str = "No",
223
+ ):
224
+ """
225
+ Collect together output CSVs from various output boxes and combine them into a single output Excel file.
226
+
227
+ Args:
228
+ in_data_files (List): A list of paths to the input data files.
229
+ chosen_cols (list[str]): A list of column names selected for analysis.
230
+ reference_data_file_name_textbox (str): The name of the reference data file.
231
+ in_group_col (str): The column used for grouping the data.
232
+ model_choice (str): The LLM model chosen for the analysis.
233
+ master_reference_df_state (pd.DataFrame): The master DataFrame containing reference data.
234
+ master_unique_topics_df_state (pd.DataFrame): The master DataFrame containing unique topics data.
235
+ summarised_output_df (pd.DataFrame): DataFrame containing the summarised output.
236
+ missing_df_state (pd.DataFrame): DataFrame containing information about missing data.
237
+ excel_sheets (str): Information regarding Excel sheets, typically sheet names or structure.
238
+ usage_logs_location (str, optional): Path to the usage logs CSV file. Defaults to "".
239
+ model_name_map (dict, optional): A dictionary mapping model choices to their display names. Defaults to {}.
240
+ output_folder (str, optional): The directory where the output Excel file will be saved. Defaults to OUTPUT_FOLDER.
241
+ structured_summaries (str, optional): Indicates whether structured summaries are being produced ("Yes" or "No"). Defaults to "No".
242
+
243
+ Returns:
244
+ tuple: A tuple containing:
245
+ - list: A list of paths to the generated Excel output files.
246
+ - list: A duplicate of the list of paths to the generated Excel output files (for UI compatibility).
247
+ """
248
+ # Use passed model_name_map if provided and not empty, otherwise use global one
249
+ if not model_name_map:
250
+ model_name_map = global_model_name_map
251
+
252
+ # Ensure custom model_choice is registered in model_name_map
253
+ ensure_model_in_map(model_choice, model_name_map)
254
+
255
+ if structured_summaries == "Yes":
256
+ structured_summaries = True
257
+ else:
258
+ structured_summaries = False
259
+
260
+ if not chosen_cols:
261
+ raise Exception("Could not find chosen column")
262
+
263
+ today_date = datetime.today().strftime("%Y-%m-%d")
264
+ original_data_file_path = os.path.abspath(in_data_files[0])
265
+
266
+ csv_files = list()
267
+ sheet_names = list()
268
+ column_widths = dict()
269
+ wrap_text_columns = dict()
270
+ short_file_name = os.path.basename(reference_data_file_name_textbox)
271
+ reference_pivot_table = pd.DataFrame()
272
+ reference_table_csv_path = ""
273
+ reference_pivot_table_csv_path = ""
274
+ unique_topic_table_csv_path = ""
275
+ missing_df_state_csv_path = ""
276
+ overall_summary_csv_path = ""
277
+ number_of_responses_with_topic_assignment = 0
+ unique_reference_numbers = list()  # fallback when no reference data is available
278
+
279
+ if in_group_col:
280
+ group = in_group_col
281
+ else:
282
+ group = "All"
283
+
284
+ overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv"
285
+
286
+ if structured_summaries is True and not master_unique_topics_df_state.empty:
287
+ print("Producing overall summary based on structured summaries.")
288
+ # Create structured summary from master_unique_topics_df_state
289
+ structured_summary_data = list()
290
+
291
+ # Group by 'Group' column
292
+ for group_name, group_df in master_unique_topics_df_state.groupby("Group"):
293
+ group_summary = f"## {group_name}\n\n"
294
+
295
+ # Group by 'General topic' within each group
296
+ for general_topic, topic_df in group_df.groupby("General topic"):
297
+ group_summary += f"### {general_topic}\n\n"
298
+
299
+ # Add subtopics under each general topic
300
+ for _, row in topic_df.iterrows():
301
+ subtopic = row["Subtopic"]
302
+ summary = row["Summary"]
303
+ # sentiment = row.get('Sentiment', '')
304
+ # num_responses = row.get('Number of responses', '')
305
+
306
+ # Create subtopic entry
307
+ subtopic_entry = f"**{subtopic}**"
308
+ # if sentiment:
309
+ # subtopic_entry += f" ({sentiment})"
310
+ # if num_responses:
311
+ # subtopic_entry += f" - {num_responses} responses"
312
+ subtopic_entry += "\n\n"
313
+
314
+ if summary and pd.notna(summary):
315
+ subtopic_entry += f"{summary}\n\n"
316
+
317
+ group_summary += subtopic_entry
318
+
319
+ # Add to structured summary data
320
+ structured_summary_data.append(
321
+ {"Group": group_name, "Summary": group_summary.strip()}
322
+ )
323
+
324
+ # Create DataFrame for structured summary
325
+ structured_summary_df = pd.DataFrame(structured_summary_data)
326
+ structured_summary_df.to_csv(overall_summary_csv_path, index=False)
327
+ else:
328
+ # Use original summarised_output_df
329
+ structured_summary_df = summarised_output_df
330
+ structured_summary_df.to_csv(overall_summary_csv_path, index=None)
331
+
332
+ if not structured_summary_df.empty:
333
+ csv_files.append(overall_summary_csv_path)
334
+ sheet_names.append("Overall summary")
335
+ column_widths["Overall summary"] = {"A": 20, "B": 100}
336
+ wrap_text_columns["Overall summary"] = ["B"]
337
+
338
+ if not master_reference_df_state.empty:
339
+ # Simplify table to just responses column and the Response reference number
340
+ file_data, file_name, num_batches = load_in_data_file(
341
+ in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets
342
+ )
343
+ basic_response_data = get_basic_response_data(
344
+ file_data, chosen_cols, verify_titles="No"
345
+ )
346
+ reference_pivot_table = convert_reference_table_to_pivot_table(
347
+ master_reference_df_state, basic_response_data
348
+ )
349
+
350
+ unique_reference_numbers = basic_response_data["Reference"].tolist()
351
+
352
+ try:
353
+ master_reference_df_state.rename(
354
+ columns={"Topic_number": "Topic number"}, inplace=True, errors="ignore"
355
+ )
356
+ master_reference_df_state.drop(
357
+ columns=["1", "2", "3"], inplace=True, errors="ignore"
358
+ )
359
+ except Exception as e:
360
+ print("Could not rename Topic_number due to", e)
361
+
362
+ number_of_responses_with_topic_assignment = len(
363
+ master_reference_df_state["Response References"].unique()
364
+ )
365
+
366
+ reference_table_csv_path = output_folder + "reference_df_for_xlsx.csv"
367
+ master_reference_df_state.to_csv(reference_table_csv_path, index=None)
368
+
369
+ reference_pivot_table_csv_path = (
370
+ output_folder + "reference_pivot_df_for_xlsx.csv"
371
+ )
372
+ reference_pivot_table.to_csv(reference_pivot_table_csv_path, index=None)
373
+
374
+ short_file_name = os.path.basename(file_name)
375
+
376
+ if not master_unique_topics_df_state.empty:
377
+
378
+ master_unique_topics_df_state.drop(
379
+ columns=["1", "2", "3"], inplace=True, errors="ignore"
380
+ )
381
+
382
+ unique_topic_table_csv_path = (
383
+ output_folder + "unique_topic_table_df_for_xlsx.csv"
384
+ )
385
+ master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index=None)
386
+
387
+ if unique_topic_table_csv_path:
388
+ csv_files.append(unique_topic_table_csv_path)
389
+ sheet_names.append("Topic summary")
390
+ column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F": 100}
391
+ wrap_text_columns["Topic summary"] = ["B", "F"]
392
+ else:
393
+ print("Relevant unique topic files not found, excluding from xlsx output.")
394
+
395
+ if reference_table_csv_path:
396
+ if structured_summaries:
397
+ print(
398
+ "Structured summaries are being produced, excluding response level data from xlsx output."
399
+ )
400
+ else:
401
+ csv_files.append(reference_table_csv_path)
402
+ sheet_names.append("Response level data")
403
+ column_widths["Response level data"] = {"A": 15, "B": 30, "C": 40, "H": 100}
404
+ wrap_text_columns["Response level data"] = ["C", "G"]
405
+ else:
406
+ print("Relevant reference files not found, excluding from xlsx output.")
407
+
408
+ if reference_pivot_table_csv_path:
409
+ if structured_summaries:
410
+ print(
411
+ "Structured summaries are being produced, excluding topic response pivot table from xlsx output."
412
+ )
413
+ else:
414
+ csv_files.append(reference_pivot_table_csv_path)
415
+ sheet_names.append("Topic response pivot table")
416
+
417
+ if reference_pivot_table.empty:
418
+ reference_pivot_table = pd.read_csv(reference_pivot_table_csv_path)
419
+
420
+ # Base widths and wrap
421
+ column_widths["Topic response pivot table"] = {"A": 25, "B": 100}
422
+ wrap_text_columns["Topic response pivot table"] = ["B"]
423
+
424
+ num_cols = len(reference_pivot_table.columns)
425
+ col_letters = [get_column_letter(i) for i in range(3, num_cols + 1)]
426
+
427
+ for col_letter in col_letters:
428
+ column_widths["Topic response pivot table"][col_letter] = 25
429
+
430
+ wrap_text_columns["Topic response pivot table"].extend(col_letters)
431
+ else:
432
+ print(
433
+ "Relevant reference pivot table files not found, excluding from xlsx output."
434
+ )
435
+
436
+ if not missing_df_state.empty:
437
+ missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
438
+ missing_df_state.to_csv(missing_df_state_csv_path, index=None)
439
+
440
+ if missing_df_state_csv_path:
441
+ if structured_summaries:
442
+ print(
443
+ "Structured summaries are being produced, excluding missing responses from xlsx output."
444
+ )
445
+ else:
446
+ csv_files.append(missing_df_state_csv_path)
447
+ sheet_names.append("Missing responses")
448
+ column_widths["Missing responses"] = {"A": 25, "B": 30, "C": 50}
449
+ wrap_text_columns["Missing responses"] = ["C"]
450
+ else:
451
+ print("Relevant missing responses files not found, excluding from xlsx output.")
452
+
453
+ new_csv_files = csv_files.copy()
454
+
455
+ # Original data file
456
+ original_ext = os.path.splitext(original_data_file_path)[1].lower()
457
+ if original_ext == ".csv":
458
+ csv_files.append(original_data_file_path)
459
+ else:
460
+ # Read and convert to CSV
461
+ if original_ext == ".xlsx":
462
+ if excel_sheets:
463
+ df = pd.read_excel(original_data_file_path, sheet_name=excel_sheets)
464
+ else:
465
+ df = pd.read_excel(original_data_file_path)
466
+ elif original_ext == ".parquet":
467
+ df = pd.read_parquet(original_data_file_path)
468
+ else:
469
+ raise Exception(f"Unsupported file type for original data: {original_ext}")
470
+
471
+ # Save as CSV in output folder
472
+ original_data_csv_path = os.path.join(
473
+ output_folder,
474
+ os.path.splitext(os.path.basename(original_data_file_path))[0]
475
+ + "_for_xlsx.csv",
476
+ )
477
+ df.to_csv(original_data_csv_path, index=False)
478
+ csv_files.append(original_data_csv_path)
479
+
480
+ sheet_names.append("Original data")
481
+ column_widths["Original data"] = {"A": 20, "B": 20, "C": 20}
482
+ wrap_text_columns["Original data"] = ["C"]
483
+ if isinstance(chosen_cols, list) and chosen_cols:
484
+ chosen_cols = chosen_cols[0]
485
+ else:
486
+ chosen_cols = str(chosen_cols) if chosen_cols else ""
487
+
488
+ # Intro page text
489
+ intro_text = [
490
+ "This workbook contains outputs from the large language model topic analysis of open text data. Each sheet corresponds to a different CSV report included in the analysis.",
491
+ f"The file analysed was {short_file_name}, the column analysed was '{chosen_cols}' and the data was grouped by column '{group}'."
492
+ " Please contact the LLM Topic Modelling app administrator if you need any explanation on how to use the results."
493
+ "Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this analysis **need to be checked by a human** to check for harmful outputs, false information, and bias.",
494
+ ]
495
+
496
+ # Get values for number of rows, number of responses, and number of responses longer than five words
497
+ number_of_responses = basic_response_data.shape[0]
498
+ # number_of_responses_with_text = basic_response_data["Response"].str.strip().notnull().sum()
499
+ number_of_responses_with_text = (
500
+ basic_response_data["Response"].str.strip().notnull()
501
+ & (basic_response_data["Response"].str.split().str.len() >= 1)
502
+ ).sum()
503
+ number_of_responses_with_text_five_plus_words = (
504
+ basic_response_data["Response"].str.strip().notnull()
505
+ & (basic_response_data["Response"].str.split().str.len() >= 5)
506
+ ).sum()
507
+
508
+ # Get number of LLM calls, input and output tokens
509
+ if usage_logs_location:
510
+ try:
511
+ usage_logs = pd.read_csv(usage_logs_location)
512
+
513
+ relevant_logs = usage_logs.loc[
514
+ (
515
+ usage_logs["Reference data file name"]
516
+ == reference_data_file_name_textbox
517
+ )
518
+ & (
519
+ usage_logs[
520
+ "Large language model for topic extraction and summarisation"
521
+ ]
522
+ == model_choice
523
+ )
524
+ & (
525
+ usage_logs[
526
+ "Select the open text column of interest. In an Excel file, this shows columns across all sheets."
527
+ ]
528
+ == (
529
+ chosen_cols[0]
530
+ if isinstance(chosen_cols, list) and chosen_cols
531
+ else chosen_cols
532
+ )
533
+ ),
534
+ :,
535
+ ]
536
+
537
+ llm_call_number = sum(relevant_logs["Total LLM calls"].astype(int))
538
+ input_tokens = sum(relevant_logs["Total input tokens"].astype(int))
539
+ output_tokens = sum(relevant_logs["Total output tokens"].astype(int))
540
+ time_taken = sum(
541
+ relevant_logs["Estimated time taken (seconds)"].astype(float)
542
+ )
543
+ except Exception as e:
544
+ print("Could not obtain usage logs due to:", e)
545
+ usage_logs = pd.DataFrame()
546
+ llm_call_number = 0
547
+ input_tokens = 0
548
+ output_tokens = 0
549
+ time_taken = 0
550
+ else:
551
+ print("LLM call logs location not provided")
552
+ usage_logs = pd.DataFrame()
553
+ llm_call_number = 0
554
+ input_tokens = 0
555
+ output_tokens = 0
556
+ time_taken = 0
557
+
558
+ # Create short filename:
559
+ model_choice_clean_short = clean_column_name(
560
+ model_name_map[model_choice]["short_name"],
561
+ max_length=20,
562
+ front_characters=False,
563
+ )
564
+ # Extract first column name as string for cleaning and Excel output
565
+ chosen_col_str = (
566
+ chosen_cols[0]
567
+ if isinstance(chosen_cols, list) and chosen_cols
568
+ else str(chosen_cols) if chosen_cols else ""
569
+ )
570
+ in_column_cleaned = clean_column_name(chosen_col_str, max_length=20)
571
+ file_name_cleaned = clean_column_name(
572
+ file_name, max_length=20, front_characters=True
573
+ )
574
+
575
+ # Save outputs for each batch. If master file created, label file as master
576
+ file_path_details = (
577
+ f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}"
578
+ )
579
+ output_xlsx_filename = (
580
+ output_folder
581
+ + file_path_details
582
+ + ("_structured_summaries" if structured_summaries else "_topic_analysis")
583
+ + ".xlsx"
584
+ )
585
+
586
+ xlsx_output_filename = csvs_to_excel(
587
+ csv_files=csv_files,
588
+ output_filename=output_xlsx_filename,
589
+ sheet_names=sheet_names,
590
+ column_widths=column_widths,
591
+ wrap_text_columns=wrap_text_columns,
592
+ intro_text=intro_text,
593
+ model_name=model_choice,
594
+ analysis_date=today_date,
595
+ analysis_cost="",
596
+ llm_call_number=llm_call_number,
597
+ input_tokens=input_tokens,
598
+ output_tokens=output_tokens,
599
+ time_taken=time_taken,
600
+ number_of_responses=number_of_responses,
601
+ number_of_responses_with_text=number_of_responses_with_text,
602
+ number_of_responses_with_text_five_plus_words=number_of_responses_with_text_five_plus_words,
603
+ column_name=chosen_col_str,
604
+ number_of_responses_with_topic_assignment=number_of_responses_with_topic_assignment,
605
+ file_name=short_file_name,
606
+ unique_reference_numbers=unique_reference_numbers,
607
+ )
608
+
609
+ xlsx_output_filenames = [xlsx_output_filename]
610
+
611
+ # Delete intermediate csv files
612
+ for csv_file in new_csv_files:
613
+ os.remove(csv_file)
614
+
615
+ return xlsx_output_filenames, xlsx_output_filenames
tools/config.py ADDED
@@ -0,0 +1,950 @@
1
+ import codecs
2
+ import logging
3
+ import os
4
+ import socket
5
+ import tempfile
6
+ from datetime import datetime
7
+ from typing import List
8
+
9
+ from dotenv import load_dotenv
10
+
11
+ today_rev = datetime.now().strftime("%Y%m%d")
12
+ HOST_NAME = socket.gethostname()
13
+
14
+ # Set or retrieve configuration variables for the topic modelling app
15
+
16
+
17
+ def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
18
+ """
19
+ Get an environmental variable, and set it to a default value if it doesn't exist
20
+ """
21
+ # Get the environment variable if it exists
22
+ value = os.environ.get(var_name)
23
+
24
+ # If it doesn't exist, set the environment variable to the default value
25
+ if value is None:
26
+ os.environ[var_name] = default_value
27
+ value = default_value
28
+
29
+ if print_val is True:
30
+ print(f"The value of {var_name} is {value}")
31
+
32
+ return value
33
+
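
A quick example of the helper's behaviour (MY_FLAG is a hypothetical variable name):

    import os
    os.environ.pop("MY_FLAG", None)
    get_or_create_env_var("MY_FLAG", "0")   # returns "0" and sets os.environ["MY_FLAG"] = "0"
    os.environ["MY_FLAG"] = "1"
    get_or_create_env_var("MY_FLAG", "0")   # returns "1" - an existing value wins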
34
+
35
+ def add_folder_to_path(folder_path: str):
36
+ """
37
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
38
+ """
39
+
40
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
41
+ print(folder_path, "folder exists.")
42
+
43
+ # Resolve relative path to absolute path
44
+ absolute_path = os.path.abspath(folder_path)
45
+
46
+ current_path = os.environ["PATH"]
47
+ if absolute_path not in current_path.split(os.pathsep):
48
+ full_path_extension = absolute_path + os.pathsep + current_path
49
+ os.environ["PATH"] = full_path_extension
50
+ # print(f"Updated PATH with: ", full_path_extension)
51
+ else:
52
+ print(f"Directory {folder_path} already exists in PATH.")
53
+ else:
54
+ print(f"Folder not found at {folder_path} - not added to PATH")
55
+
56
+
57
+ def convert_string_to_boolean(value: str) -> bool:
58
+ """Convert string to boolean, handling various formats."""
59
+ if isinstance(value, bool):
60
+ return value
61
+ elif value in ["True", "1", "true", "TRUE"]:
62
+ return True
63
+ elif value in ["False", "0", "false", "FALSE"]:
64
+ return False
65
+ else:
66
+ raise ValueError(f"Invalid boolean value: {value}")
67
+
68
+
69
+ ###
70
+ # LOAD CONFIG FROM ENV FILE
71
+ ###
72
+
73
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
74
+
75
+ # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
76
+ APP_CONFIG_PATH = get_or_create_env_var(
77
+ "APP_CONFIG_PATH", CONFIG_FOLDER + "app_config.env"
78
+ ) # e.g. config/app_config.env
79
+
80
+ if APP_CONFIG_PATH:
81
+ if os.path.exists(APP_CONFIG_PATH):
82
+ print(f"Loading app variables from config file {APP_CONFIG_PATH}")
83
+ load_dotenv(APP_CONFIG_PATH)
84
+ else:
85
+ print("App config file not found at location:", APP_CONFIG_PATH)
86
+
87
+ ###
88
+ # AWS OPTIONS
89
+ ###
90
+
91
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'config/aws_config.env'
92
+ AWS_CONFIG_PATH = get_or_create_env_var(
93
+ "AWS_CONFIG_PATH", ""
94
+ ) # e.g. config/aws_config.env
95
+
96
+ if AWS_CONFIG_PATH:
97
+ if os.path.exists(AWS_CONFIG_PATH):
98
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
99
+ load_dotenv(AWS_CONFIG_PATH)
100
+ else:
101
+ print("AWS config file not found at location:", AWS_CONFIG_PATH)
102
+
103
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
104
+
105
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
106
+
107
+ AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")
108
+
109
+ AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")
110
+
111
+ AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")
112
+
113
+ AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
114
+ # if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
115
+
116
+ AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
117
+ # if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
118
+
119
+ # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
120
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var(
121
+ "PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "1"
122
+ )
123
+
124
+ S3_LOG_BUCKET = get_or_create_env_var("S3_LOG_BUCKET", "")
125
+
126
+ # Custom headers e.g. if routing traffic through Cloudfront
127
+ # Retrieving or setting CUSTOM_HEADER
128
+ CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")
129
+
130
+ # Retrieving or setting CUSTOM_HEADER_VALUE
131
+ CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
132
+
133
+ ###
134
+ # File I/O
135
+ ###
136
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var(
137
+ "SESSION_OUTPUT_FOLDER", "False"
138
+ ) # i.e. should input and output files be saved within a subfolder, named after the session hash value, inside the input/output folders
139
+
140
+ OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
141
+ INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
142
+
143
+
144
+ # Allow for files to be saved in a temporary folder for increased security in some instances
145
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
146
+ # Create a persistent temporary directory. mkdtemp is used rather than
+ # tempfile.TemporaryDirectory, which would delete the folder as soon as
+ # the context manager exits, leaving the folder variables pointing at a
+ # removed directory
147
+ temp_dir = tempfile.mkdtemp()
148
+ print(f"Temporary directory created at: {temp_dir}")
149
+
150
+ if OUTPUT_FOLDER == "TEMP":
151
+ OUTPUT_FOLDER = temp_dir + "/"
152
+ if INPUT_FOLDER == "TEMP":
153
+ INPUT_FOLDER = temp_dir + "/"
154
+
155
+
156
+ GRADIO_TEMP_DIR = get_or_create_env_var(
157
+ "GRADIO_TEMP_DIR", "tmp/gradio_tmp/"
158
+ ) # Default Gradio temp folder
159
+ MPLCONFIGDIR = get_or_create_env_var(
160
+ "MPLCONFIGDIR", "tmp/matplotlib_cache/"
161
+ ) # Matplotlib cache folder
162
+
163
+ S3_OUTPUTS_BUCKET = get_or_create_env_var("S3_OUTPUTS_BUCKET", "")
164
+ S3_OUTPUTS_FOLDER = get_or_create_env_var("S3_OUTPUTS_FOLDER", "")
165
+ SAVE_OUTPUTS_TO_S3 = get_or_create_env_var("SAVE_OUTPUTS_TO_S3", "False")
166
+
167
+ ###
168
+ # LOGGING OPTIONS
169
+ ###
170
+
171
+ # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
172
+ # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
173
+
174
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
175
+
176
+ USE_LOG_SUBFOLDERS = get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
177
+
178
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
179
+ ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
180
+ USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
181
+
182
+ # Initialize full_log_subfolder based on USE_LOG_SUBFOLDERS setting
183
+ if USE_LOG_SUBFOLDERS == "True":
184
+ day_log_subfolder = today_rev + "/"
185
+ host_name_subfolder = HOST_NAME + "/"
186
+ full_log_subfolder = day_log_subfolder + host_name_subfolder
187
+
188
+ FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
189
+ ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
190
+ USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
191
+ else:
192
+ full_log_subfolder = "" # Empty string when subfolders are not used
193
+
194
+ S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var(
195
+ "S3_FEEDBACK_LOGS_FOLDER", "feedback/" + full_log_subfolder
196
+ )
197
+ S3_ACCESS_LOGS_FOLDER = get_or_create_env_var(
198
+ "S3_ACCESS_LOGS_FOLDER", "logs/" + full_log_subfolder
199
+ )
200
+ S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
201
+ "S3_USAGE_LOGS_FOLDER", "usage/" + full_log_subfolder
202
+ )
203
+
204
+ LOG_FILE_NAME = get_or_create_env_var("LOG_FILE_NAME", "log.csv")
205
+ USAGE_LOG_FILE_NAME = get_or_create_env_var("USAGE_LOG_FILE_NAME", LOG_FILE_NAME)
206
+ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FILE_NAME)
207
+
208
+ # Should the input file name be included in the logs? In some instances, the names of the files themselves could be sensitive and should not be disclosed beyond the app, so by default this is false.
209
+ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var(
210
+ "DISPLAY_FILE_NAMES_IN_LOGS", "False"
211
+ )
212
+
213
+ # Further customisation options for CSV logs
214
+
215
+ CSV_ACCESS_LOG_HEADERS = get_or_create_env_var(
216
+ "CSV_ACCESS_LOG_HEADERS", ""
217
+ ) # If blank, uses component labels
218
+ CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
219
+ "CSV_FEEDBACK_LOG_HEADERS", ""
220
+ ) # If blank, uses component labels
221
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
222
+ "CSV_USAGE_LOG_HEADERS", ""
223
+ ) # If blank, uses component labels
224
+
225
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
226
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
227
+
228
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
229
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", "llm_topic_model_access_log"
230
+ )
231
+ DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")
232
+
233
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
234
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "llm_topic_model_feedback"
235
+ )
236
+ DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
237
+ "DYNAMODB_FEEDBACK_LOG_HEADERS", ""
238
+ )
239
+
240
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
241
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", "llm_topic_model_usage"
242
+ )
243
+ DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
244
+
245
+ # Report logging to console?
246
+ LOGGING = get_or_create_env_var("LOGGING", "False")
247
+
248
+ if LOGGING == "True":
249
+ # Configure logging
250
+ logging.basicConfig(
251
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
252
+ )
253
+
254
+ ###
255
+ # App run variables
256
+ ###
257
+ OUTPUT_DEBUG_FILES = get_or_create_env_var(
258
+ "OUTPUT_DEBUG_FILES", "False"
259
+ ) # Whether to output debug files
260
+ SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES = get_or_create_env_var(
261
+ "SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES", "True"
262
+ ) # Whether to show additional instruction textboxes in the GUI
263
+
264
+ TIMEOUT_WAIT = int(
265
+ get_or_create_env_var("TIMEOUT_WAIT", "30")
266
+ ) # Maximum number of seconds to wait for a response from the LLM
267
+ NUMBER_OF_RETRY_ATTEMPTS = int(
268
+ get_or_create_env_var("NUMBER_OF_RETRY_ATTEMPTS", "5")
269
+ ) # Maximum number of times to retry a request to the LLM
270
+ # Try up to 3 times to get a valid markdown table response with LLM calls, otherwise retry with temperature changed
271
+ MAX_OUTPUT_VALIDATION_ATTEMPTS = int(
272
+ get_or_create_env_var("MAX_OUTPUT_VALIDATION_ATTEMPTS", "3")
273
+ )
274
+ ENABLE_VALIDATION = get_or_create_env_var(
275
+ "ENABLE_VALIDATION", "False"
276
+ ) # Whether to run validation loop after initial topic extraction
277
+ MAX_TIME_FOR_LOOP = int(
278
+ get_or_create_env_var("MAX_TIME_FOR_LOOP", "99999")
279
+ ) # Maximum number of seconds to run the loop for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there)
280
+
281
+ MAX_COMMENT_CHARS = int(
282
+ get_or_create_env_var("MAX_COMMENT_CHARS", "14000")
283
+ ) # Maximum number of characters in a comment
284
+ MAX_ROWS = int(
285
+ get_or_create_env_var("MAX_ROWS", "5000")
286
+ ) # Maximum number of rows to process
287
+ MAX_GROUPS = int(
288
+ get_or_create_env_var("MAX_GROUPS", "99")
289
+ ) # Maximum number of groups to process
290
+ BATCH_SIZE_DEFAULT = int(
291
+ get_or_create_env_var("BATCH_SIZE_DEFAULT", "5")
292
+ ) # Default batch size for LLM calls
293
+ MAXIMUM_ZERO_SHOT_TOPICS = int(
294
+ get_or_create_env_var("MAXIMUM_ZERO_SHOT_TOPICS", "120")
295
+ ) # Maximum number of zero shot topics to process
296
+ MAX_SPACES_GPU_RUN_TIME = int(
297
+ get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "240")
298
+ ) # Maximum number of seconds to run on GPU on Hugging Face Spaces
299
+
300
+ DEDUPLICATION_THRESHOLD = int(
301
+ get_or_create_env_var("DEDUPLICATION_THRESHOLD", "90")
302
+ ) # Deduplication threshold for topic summary tables
303
+
304
+ ###
305
+ # Model options
306
+ ###
307
+
308
+ RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "0")
309
+
310
+ RUN_AWS_BEDROCK_MODELS = get_or_create_env_var("RUN_AWS_BEDROCK_MODELS", "1")
311
+
312
+ RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
313
+ GEMINI_API_KEY = get_or_create_env_var("GEMINI_API_KEY", "")
314
+
315
+ INTRO_TEXT = get_or_create_env_var(
316
+ "INTRO_TEXT",
317
+ """# Large language model topic modelling
318
+
319
+ Extract topics and summarise outputs using Large Language Models (LLMs, Gemma 3 4b/GPT-OSS 20b if local (see tools/config.py to modify), Gemini, Azure/OpenAI, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets below. API keys for AWS, Azure/OpenAI, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
320
+
321
+ NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""",
322
+ )
323
+
324
+ # Read in intro text from a text file if it is a path to a text file
325
+ if INTRO_TEXT.endswith(".txt"):
326
+ INTRO_TEXT = open(INTRO_TEXT, "r").read()
327
+
328
+ INTRO_TEXT = INTRO_TEXT.strip('"').strip("'")
329
+
330
+ # Azure/OpenAI AI Inference settings
331
+ RUN_AZURE_MODELS = get_or_create_env_var("RUN_AZURE_MODELS", "1")
332
+ AZURE_OPENAI_API_KEY = get_or_create_env_var("AZURE_OPENAI_API_KEY", "")
333
+ AZURE_OPENAI_INFERENCE_ENDPOINT = get_or_create_env_var(
334
+ "AZURE_OPENAI_INFERENCE_ENDPOINT", ""
335
+ )
336
+
337
+ # Llama-server settings
338
+ RUN_INFERENCE_SERVER = get_or_create_env_var("RUN_INFERENCE_SERVER", "0")
339
+ API_URL = get_or_create_env_var("API_URL", "http://localhost:8080")
340
+
341
+ RUN_MCP_SERVER = convert_string_to_boolean(
342
+ get_or_create_env_var("RUN_MCP_SERVER", "False")
343
+ )
344
+
345
+ # Build up options for models
346
+ model_full_names = list()
347
+ model_short_names = list()
348
+ model_source = list()
349
+
350
+ CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var(
351
+ "CHOSEN_LOCAL_MODEL_TYPE", "Qwen 3 4B"
352
+ ) # Alternatives include "Gemma 3 1B", "Gemma 2b", "Gemma 3 4B"
353
+
354
+ USE_LLAMA_SWAP = get_or_create_env_var("USE_LLAMA_SWAP", "False")
355
+ if USE_LLAMA_SWAP == "True":
356
+ USE_LLAMA_SWAP = True
357
+ else:
358
+ USE_LLAMA_SWAP = False
359
+
360
+ if RUN_LOCAL_MODEL == "1" and CHOSEN_LOCAL_MODEL_TYPE:
361
+ model_full_names.append(CHOSEN_LOCAL_MODEL_TYPE)
362
+ model_short_names.append(CHOSEN_LOCAL_MODEL_TYPE)
363
+ model_source.append("Local")
364
+
365
+ if RUN_AWS_BEDROCK_MODELS == "1":
366
+ amazon_models = [
367
+ "anthropic.claude-3-haiku-20240307-v1:0",
368
+ "anthropic.claude-3-7-sonnet-20250219-v1:0",
369
+ "anthropic.claude-sonnet-4-5-20250929-v1:0",
370
+ "amazon.nova-micro-v1:0",
371
+ "amazon.nova-lite-v1:0",
372
+ "amazon.nova-pro-v1:0",
373
+ "deepseek.v3-v1:0",
374
+ "openai.gpt-oss-20b-1:0",
375
+ "openai.gpt-oss-120b-1:0",
376
+ "google.gemma-3-12b-it",
377
+ "mistral.ministral-3-14b-instruct",
378
+ ]
379
+ model_full_names.extend(amazon_models)
380
+ model_short_names.extend(
381
+ [
382
+ "haiku",
383
+ "sonnet_3_7",
384
+ "sonnet_4_5",
385
+ "nova_micro",
386
+ "nova_lite",
387
+ "nova_pro",
388
+ "deepseek_v3",
389
+ "gpt_oss_20b_aws",
390
+ "gpt_oss_120b_aws",
391
+ "gemma_3_12b_it",
392
+ "ministral_3_14b_instruct",
393
+ ]
394
+ )
395
+ model_source.extend(["AWS"] * len(amazon_models))
396
+
397
+ if RUN_GEMINI_MODELS == "1":
398
+ gemini_models = ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"]
399
+ model_full_names.extend(gemini_models)
400
+ model_short_names.extend(
401
+ ["gemini_flash_lite_2.5", "gemini_flash_2.5", "gemini_pro"]
402
+ )
403
+ model_source.extend(["Gemini"] * len(gemini_models))
404
+
405
+ # Register Azure/OpenAI AI models (model names must match your Azure/OpenAI deployments)
406
+ if RUN_AZURE_MODELS == "1":
407
+ # Example deployments; adjust to the deployments you actually create in Azure/OpenAI
408
+ azure_models = ["gpt-5-mini", "gpt-4o-mini"]
409
+ model_full_names.extend(azure_models)
410
+ model_short_names.extend(["gpt-5-mini", "gpt-4o-mini"])
411
+ model_source.extend(["Azure/OpenAI"] * len(azure_models))
412
+
413
+ # Register inference-server models
414
+ CHOSEN_INFERENCE_SERVER_MODEL = ""
415
+ if RUN_INFERENCE_SERVER == "1":
416
+ # Example inference-server models; adjust to the models you have available on your server
417
+ inference_server_models = [
418
+ "unnamed-inference-server-model",
419
+ "qwen_3_4b_it",
420
+ "qwen_3_4b_think",
421
+ "gpt_oss_20b",
422
+ "gemma_3_12b",
423
+ "ministral_3_14b_it",
424
+ ]
425
+ model_full_names.extend(inference_server_models)
426
+ model_short_names.extend(inference_server_models)
427
+ model_source.extend(["inference-server"] * len(inference_server_models))
428
+
429
+ CHOSEN_INFERENCE_SERVER_MODEL = get_or_create_env_var(
430
+ "CHOSEN_INFERENCE_SERVER_MODEL", inference_server_models[0]
431
+ )
432
+
433
+ if CHOSEN_INFERENCE_SERVER_MODEL not in inference_server_models:
434
+ model_full_names.append(CHOSEN_INFERENCE_SERVER_MODEL)
435
+ model_short_names.append(CHOSEN_INFERENCE_SERVER_MODEL)
436
+ model_source.append("inference-server")
437
+
438
+ model_name_map = {
439
+ full: {"short_name": short, "source": source}
440
+ for full, short, source in zip(model_full_names, model_short_names, model_source)
441
+ }
442
+
443
+ if RUN_LOCAL_MODEL == "1":
444
+ default_model_choice = CHOSEN_LOCAL_MODEL_TYPE
445
+ elif RUN_INFERENCE_SERVER == "1":
446
+ default_model_choice = CHOSEN_INFERENCE_SERVER_MODEL
447
+ elif RUN_AWS_FUNCTIONS == "1":
448
+ default_model_choice = amazon_models[0]
449
+ else:
450
+ default_model_choice = gemini_models[0]
451
+
452
+ default_model_source = model_name_map[default_model_choice]["source"]
453
+ model_sources = list(
454
+ set([model_name_map[model]["source"] for model in model_full_names])
455
+ )
456
+
457
+
458
+ def update_model_choice_config(default_model_source, model_name_map):
459
+ # Filter models by source and return the first matching model name
460
+ matching_models = [
461
+ model_name
462
+ for model_name, model_info in model_name_map.items()
463
+ if model_info["source"] == default_model_source
464
+ ]
465
+
466
+ output_model = matching_models[0] if matching_models else model_full_names[0]
467
+
468
+ return output_model, matching_models
469
+
470
+
471
+ default_model_choice, default_source_models = update_model_choice_config(
472
+ default_model_source, model_name_map
473
+ )
+
+ # print("model_name_map:", model_name_map)
+
+ # HF token may or may not be needed for downloading models from Hugging Face
+ HF_TOKEN = get_or_create_env_var("HF_TOKEN", "")
+
+ LOAD_LOCAL_MODEL_AT_START = get_or_create_env_var("LOAD_LOCAL_MODEL_AT_START", "False")
+
+ # If you are using a system with low VRAM, you can set this to True to reduce the memory requirements
+ LOW_VRAM_SYSTEM = get_or_create_env_var("LOW_VRAM_SYSTEM", "False")
+
+ MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var("MULTIMODAL_PROMPT_FORMAT", "False")
+
+ if LOW_VRAM_SYSTEM == "True":
+     print("Using settings for low VRAM system")
+     USE_LLAMA_CPP = get_or_create_env_var("USE_LLAMA_CPP", "True")
+     LLM_MAX_NEW_TOKENS = int(get_or_create_env_var("LLM_MAX_NEW_TOKENS", "4096"))
+     LLM_CONTEXT_LENGTH = int(get_or_create_env_var("LLM_CONTEXT_LENGTH", "16384"))
+     LLM_BATCH_SIZE = int(get_or_create_env_var("LLM_BATCH_SIZE", "512"))
+     K_QUANT_LEVEL = int(
+         get_or_create_env_var("K_QUANT_LEVEL", "2")
+     )  # 2 = q4_0, 8 = q8_0, 4 = fp16
+     V_QUANT_LEVEL = int(
+         get_or_create_env_var("V_QUANT_LEVEL", "2")
+     )  # 2 = q4_0, 8 = q8_0, 4 = fp16
+
+ USE_LLAMA_CPP = get_or_create_env_var(
+     "USE_LLAMA_CPP", "True"
+ )  # Llama.cpp or transformers with unsloth
+
+ LOCAL_REPO_ID = get_or_create_env_var("LOCAL_REPO_ID", "")
+ LOCAL_MODEL_FILE = get_or_create_env_var("LOCAL_MODEL_FILE", "")
+ LOCAL_MODEL_FOLDER = get_or_create_env_var("LOCAL_MODEL_FOLDER", "")
+
+ GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
+ GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
+ GEMMA2_MODEL_FILE = get_or_create_env_var(
+     "GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf"
+ )
+ GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
+
+ GEMMA3_4B_REPO_ID = get_or_create_env_var(
+     "GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF"
+ )
+ GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
+ GEMMA3_4B_MODEL_FILE = get_or_create_env_var(
+     "GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-UD-Q4_K_XL.gguf"
+ )
+ GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var(
+     "GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b"
+ )
+
+ GEMMA3_12B_REPO_ID = get_or_create_env_var(
+     "GEMMA3_12B_REPO_ID", "unsloth/gemma-3-12b-it-GGUF"
+ )
+ GEMMA3_12B_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GEMMA3_12B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-12b-it-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     GEMMA3_12B_REPO_ID = GEMMA3_12B_REPO_TRANSFORMERS_ID
+ GEMMA3_12B_MODEL_FILE = get_or_create_env_var(
+     "GEMMA3_12B_MODEL_FILE", "gemma-3-12b-it-UD-Q4_K_XL.gguf"
+ )
+ GEMMA3_12B_MODEL_FOLDER = get_or_create_env_var(
+     "GEMMA3_12B_MODEL_FOLDER", "model/gemma3_12b"
+ )
+
+ GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
+ GPT_OSS_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GPT_OSS_REPO_TRANSFORMERS_ID", "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     GPT_OSS_REPO_ID = GPT_OSS_REPO_TRANSFORMERS_ID
+ GPT_OSS_MODEL_FILE = get_or_create_env_var("GPT_OSS_MODEL_FILE", "gpt-oss-20b-F16.gguf")
+ GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_oss")
+
+ QWEN3_4B_REPO_ID = get_or_create_env_var(
+     "QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF"
+ )
+ QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
+
+ QWEN3_4B_MODEL_FILE = get_or_create_env_var(
+     "QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf"
+ )
+ QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
+
+ GRANITE_4_TINY_REPO_ID = get_or_create_env_var(
+     "GRANITE_4_TINY_REPO_ID", "unsloth/granite-4.0-h-tiny-GGUF"
+ )
+ GRANITE_4_TINY_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GRANITE_4_TINY_REPO_TRANSFORMERS_ID", "unsloth/granite-4.0-h-tiny-FP8-Dynamic"
+ )
+ if USE_LLAMA_CPP == "False":
+     GRANITE_4_TINY_REPO_ID = GRANITE_4_TINY_REPO_TRANSFORMERS_ID
+ GRANITE_4_TINY_MODEL_FILE = get_or_create_env_var(
+     "GRANITE_4_TINY_MODEL_FILE", "granite-4.0-h-tiny-UD-Q4_K_XL.gguf"
+ )
+ GRANITE_4_TINY_MODEL_FOLDER = get_or_create_env_var(
+     "GRANITE_4_TINY_MODEL_FOLDER", "model/granite"
+ )
+
+ GRANITE_4_3B_REPO_ID = get_or_create_env_var(
+     "GRANITE_4_3B_REPO_ID", "unsloth/granite-4.0-h-micro-GGUF"
+ )
+ GRANITE_4_3B_REPO_TRANSFORMERS_ID = get_or_create_env_var(
+     "GRANITE_4_3B_REPO_TRANSFORMERS_ID", "unsloth/granite-4.0-micro-unsloth-bnb-4bit"
+ )
+ if USE_LLAMA_CPP == "False":
+     GRANITE_4_3B_REPO_ID = GRANITE_4_3B_REPO_TRANSFORMERS_ID
+ GRANITE_4_3B_MODEL_FILE = get_or_create_env_var(
+     "GRANITE_4_3B_MODEL_FILE", "granite-4.0-h-micro-UD-Q4_K_XL.gguf"
+ )
+ GRANITE_4_3B_MODEL_FOLDER = get_or_create_env_var(
+     "GRANITE_4_3B_MODEL_FOLDER", "model/granite"
+ )
+
+ if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
+     LOCAL_REPO_ID = GEMMA2_REPO_ID
+     LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GEMMA2_MODEL_FOLDER
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
+     LOCAL_REPO_ID = GEMMA3_4B_REPO_ID
+     LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GEMMA3_4B_MODEL_FOLDER
+     MULTIMODAL_PROMPT_FORMAT = "True"
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 12B":
+     LOCAL_REPO_ID = GEMMA3_12B_REPO_ID
+     LOCAL_MODEL_FILE = GEMMA3_12B_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GEMMA3_12B_MODEL_FOLDER
+     MULTIMODAL_PROMPT_FORMAT = "True"
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B":
+     LOCAL_REPO_ID = QWEN3_4B_REPO_ID
+     LOCAL_MODEL_FILE = QWEN3_4B_MODEL_FILE
+     LOCAL_MODEL_FOLDER = QWEN3_4B_MODEL_FOLDER
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
+     LOCAL_REPO_ID = GPT_OSS_REPO_ID
+     LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 Tiny":
+     LOCAL_REPO_ID = GRANITE_4_TINY_REPO_ID
+     LOCAL_MODEL_FILE = GRANITE_4_TINY_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GRANITE_4_TINY_MODEL_FOLDER
+
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 Micro":
+     LOCAL_REPO_ID = GRANITE_4_3B_REPO_ID
+     LOCAL_MODEL_FILE = GRANITE_4_3B_MODEL_FILE
+     LOCAL_MODEL_FOLDER = GRANITE_4_3B_MODEL_FOLDER
+
+ elif not CHOSEN_LOCAL_MODEL_TYPE:
+     print("No local model type chosen")
+     LOCAL_REPO_ID = ""
+     LOCAL_MODEL_FILE = ""
+     LOCAL_MODEL_FOLDER = ""
+ else:
+     print("CHOSEN_LOCAL_MODEL_TYPE not found")
+     LOCAL_REPO_ID = ""
+     LOCAL_MODEL_FILE = ""
+     LOCAL_MODEL_FOLDER = ""
+
+ USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
+
+ ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "")
+ if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
+     ASSISTANT_MODEL = get_or_create_env_var(
+         "ASSISTANT_MODEL", "unsloth/gemma-3-270m-it"
+     )
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B":
+     ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
+
+ DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
+
+ GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var(
+     "GEMMA3_DRAFT_MODEL_LOC",
+     DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf",
+ )
+ GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var(
+     "GEMMA3_4B_DRAFT_MODEL_LOC",
+     DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf",
+ )
+
+ QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var(
+     "QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf"
+ )
+ QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var(
+     "QWEN3_4B_DRAFT_MODEL_LOC",
+     DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf",
+ )
+
+
+ LLM_MAX_GPU_LAYERS = int(
+     get_or_create_env_var("LLM_MAX_GPU_LAYERS", "-1")
+ )  # -1 = offload the maximum possible number of layers
+ LLM_TEMPERATURE = float(get_or_create_env_var("LLM_TEMPERATURE", "0.6"))
+ LLM_TOP_K = int(
+     get_or_create_env_var("LLM_TOP_K", "64")
+ )  # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
+ LLM_MIN_P = float(get_or_create_env_var("LLM_MIN_P", "0"))
+ LLM_TOP_P = float(get_or_create_env_var("LLM_TOP_P", "0.95"))
+ LLM_REPETITION_PENALTY = float(get_or_create_env_var("LLM_REPETITION_PENALTY", "1.0"))
+
+ LLM_LAST_N_TOKENS = int(get_or_create_env_var("LLM_LAST_N_TOKENS", "512"))
+ LLM_MAX_NEW_TOKENS = int(get_or_create_env_var("LLM_MAX_NEW_TOKENS", "4096"))
+ LLM_SEED = int(get_or_create_env_var("LLM_SEED", "42"))
+ LLM_RESET = get_or_create_env_var("LLM_RESET", "False")
+ LLM_STREAM = get_or_create_env_var("LLM_STREAM", "True")
+ LLM_THREADS = int(get_or_create_env_var("LLM_THREADS", "-1"))
+ LLM_BATCH_SIZE = int(get_or_create_env_var("LLM_BATCH_SIZE", "2048"))
+ LLM_CONTEXT_LENGTH = int(get_or_create_env_var("LLM_CONTEXT_LENGTH", "24576"))
+ LLM_SAMPLE = get_or_create_env_var("LLM_SAMPLE", "True")
+ LLM_STOP_STRINGS = get_or_create_env_var("LLM_STOP_STRINGS", r"['\n\n\n\n\n\n']")
+
+ SPECULATIVE_DECODING = get_or_create_env_var("SPECULATIVE_DECODING", "False")
+ NUM_PRED_TOKENS = int(get_or_create_env_var("NUM_PRED_TOKENS", "2"))
+ K_QUANT_LEVEL = get_or_create_env_var(
+     "K_QUANT_LEVEL", ""
+ )  # 2 = q4_0, 8 = q8_0, 4 = fp16
+ V_QUANT_LEVEL = get_or_create_env_var(
+     "V_QUANT_LEVEL", ""
+ )  # 2 = q4_0, 8 = q8_0, 4 = fp16
+
+ if not K_QUANT_LEVEL:
+     K_QUANT_LEVEL = None
+ else:
+     K_QUANT_LEVEL = int(K_QUANT_LEVEL)
+ if not V_QUANT_LEVEL:
+     V_QUANT_LEVEL = None
+ else:
+     V_QUANT_LEVEL = int(V_QUANT_LEVEL)
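These runtime settings map naturally onto the `llama-cpp-python` `Llama` constructor. The sketch below is illustrative only: the real loader presumably lives in `tools/llm_funcs.py`, the model path is simply the Gemma 3 4B default from above, and prompt-lookup decoding stands in for speculative decoding (the separate draft-model GGUF files configured above are not loaded here):

```python
from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

from tools.config import (
    K_QUANT_LEVEL,
    LLM_BATCH_SIZE,
    LLM_CONTEXT_LENGTH,
    LLM_MAX_GPU_LAYERS,
    LLM_SEED,
    NUM_PRED_TOKENS,
    SPECULATIVE_DECODING,
    V_QUANT_LEVEL,
)

# Hypothetical load call wiring the config values above into llama.cpp.
llm = Llama(
    model_path="model/gemma3_4b/gemma-3-4b-it-qat-UD-Q4_K_XL.gguf",
    n_gpu_layers=LLM_MAX_GPU_LAYERS,  # -1 offloads as many layers as will fit
    n_ctx=LLM_CONTEXT_LENGTH,
    n_batch=LLM_BATCH_SIZE,
    seed=LLM_SEED,
    type_k=K_QUANT_LEVEL,  # KV-cache quantisation; None keeps llama.cpp's default
    type_v=V_QUANT_LEVEL,
    draft_model=(
        LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS)
        if SPECULATIVE_DECODING == "True"
        else None
    ),
)
```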
+
+ # If you are using e.g. gpt-oss, you can add a reasoning suffix to set the reasoning level, or to turn reasoning off in the case of Qwen 3 4B
+ if CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
+     REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "Reasoning: low")
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B" and USE_LLAMA_CPP == "False":
+     REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "/nothink")
+ else:
+     REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "")
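The suffix is presumably appended to the prompt before generation; a trivial sketch with a hypothetical helper:

```python
from tools.config import REASONING_SUFFIX

def build_prompt(user_prompt: str) -> str:
    # Hypothetical helper: append the model-specific reasoning control phrase, if any.
    return f"{user_prompt}\n{REASONING_SUFFIX}" if REASONING_SUFFIX else user_prompt

# gpt-oss-20b                -> "...\nReasoning: low"
# Qwen 3 4B via transformers -> "...\n/nothink"
```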
+
+ # Transformers variables
+ COMPILE_TRANSFORMERS = get_or_create_env_var(
+     "COMPILE_TRANSFORMERS", "False"
+ )  # Whether to compile transformers models
+ USE_BITSANDBYTES = get_or_create_env_var(
+     "USE_BITSANDBYTES", "True"
+ )  # Whether to use bitsandbytes for quantization
+ COMPILE_MODE = get_or_create_env_var(
+     "COMPILE_MODE", "reduce-overhead"
+ )  # alternatively 'max-autotune'
+ MODEL_DTYPE = get_or_create_env_var(
+     "MODEL_DTYPE", "bfloat16"
+ )  # alternatively 'float16'
+ INT8_WITH_OFFLOAD_TO_CPU = get_or_create_env_var(
+     "INT8_WITH_OFFLOAD_TO_CPU", "False"
+ )  # Whether to offload to CPU
+
+ DEFAULT_SAMPLED_SUMMARIES = int(
+     get_or_create_env_var("DEFAULT_SAMPLED_SUMMARIES", "75")
+ )
+
+ ###
+ # Gradio app variables
+ ###
+
+ # Get some environment variables and launch the Gradio app
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
+
+ RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
+
+ # Direct mode environment variables
+ DIRECT_MODE_TASK = get_or_create_env_var("DIRECT_MODE_TASK", "extract")
+ DIRECT_MODE_INPUT_FILE = get_or_create_env_var("DIRECT_MODE_INPUT_FILE", "")
+ DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var("DIRECT_MODE_OUTPUT_DIR", OUTPUT_FOLDER)
+ DIRECT_MODE_S3_OUTPUT_BUCKET = get_or_create_env_var(
+     "DIRECT_MODE_S3_OUTPUT_BUCKET", S3_OUTPUTS_BUCKET
+ )
+ DIRECT_MODE_TEXT_COLUMN = get_or_create_env_var("DIRECT_MODE_TEXT_COLUMN", "")
+ DIRECT_MODE_PREVIOUS_OUTPUT_FILES = get_or_create_env_var(
+     "DIRECT_MODE_PREVIOUS_OUTPUT_FILES", ""
+ )
+ DIRECT_MODE_USERNAME = get_or_create_env_var("DIRECT_MODE_USERNAME", "")
+ DIRECT_MODE_GROUP_BY = get_or_create_env_var("DIRECT_MODE_GROUP_BY", "")
+ DIRECT_MODE_EXCEL_SHEETS = get_or_create_env_var("DIRECT_MODE_EXCEL_SHEETS", "")
+ DIRECT_MODE_MODEL_CHOICE = get_or_create_env_var(
+     "DIRECT_MODE_MODEL_CHOICE", default_model_choice
+ )
+ DIRECT_MODE_TEMPERATURE = get_or_create_env_var(
+     "DIRECT_MODE_TEMPERATURE", str(LLM_TEMPERATURE)
+ )
+ DIRECT_MODE_BATCH_SIZE = get_or_create_env_var(
+     "DIRECT_MODE_BATCH_SIZE", str(BATCH_SIZE_DEFAULT)
+ )
+ DIRECT_MODE_MAX_TOKENS = get_or_create_env_var(
+     "DIRECT_MODE_MAX_TOKENS", str(LLM_MAX_NEW_TOKENS)
+ )
+ DIRECT_MODE_CONTEXT = get_or_create_env_var("DIRECT_MODE_CONTEXT", "")
+ DIRECT_MODE_CANDIDATE_TOPICS = get_or_create_env_var("DIRECT_MODE_CANDIDATE_TOPICS", "")
+ DIRECT_MODE_FORCE_ZERO_SHOT = get_or_create_env_var("DIRECT_MODE_FORCE_ZERO_SHOT", "No")
+ DIRECT_MODE_FORCE_SINGLE_TOPIC = get_or_create_env_var(
+     "DIRECT_MODE_FORCE_SINGLE_TOPIC", "No"
+ )
+ DIRECT_MODE_PRODUCE_STRUCTURED_SUMMARY = get_or_create_env_var(
+     "DIRECT_MODE_PRODUCE_STRUCTURED_SUMMARY", "No"
+ )
+ DIRECT_MODE_SENTIMENT = get_or_create_env_var(
+     "DIRECT_MODE_SENTIMENT", "Negative or Positive"
+ )
+ DIRECT_MODE_ADDITIONAL_SUMMARY_INSTRUCTIONS = get_or_create_env_var(
+     "DIRECT_MODE_ADDITIONAL_SUMMARY_INSTRUCTIONS", ""
+ )
+ DIRECT_MODE_ADDITIONAL_VALIDATION_ISSUES = get_or_create_env_var(
+     "DIRECT_MODE_ADDITIONAL_VALIDATION_ISSUES", ""
+ )
+ DIRECT_MODE_SHOW_PREVIOUS_TABLE = get_or_create_env_var(
+     "DIRECT_MODE_SHOW_PREVIOUS_TABLE", "Yes"
+ )
+ DIRECT_MODE_MAX_TIME_FOR_LOOP = get_or_create_env_var(
+     "DIRECT_MODE_MAX_TIME_FOR_LOOP", str(MAX_TIME_FOR_LOOP)
+ )
+ DIRECT_MODE_DEDUP_METHOD = get_or_create_env_var("DIRECT_MODE_DEDUP_METHOD", "fuzzy")
+ DIRECT_MODE_SIMILARITY_THRESHOLD = get_or_create_env_var(
+     "DIRECT_MODE_SIMILARITY_THRESHOLD", str(DEDUPLICATION_THRESHOLD)
+ )
+ DIRECT_MODE_MERGE_SENTIMENT = get_or_create_env_var("DIRECT_MODE_MERGE_SENTIMENT", "No")
+ DIRECT_MODE_MERGE_GENERAL_TOPICS = get_or_create_env_var(
+     "DIRECT_MODE_MERGE_GENERAL_TOPICS", "Yes"
+ )
+ DIRECT_MODE_SUMMARY_FORMAT = get_or_create_env_var(
+     "DIRECT_MODE_SUMMARY_FORMAT", "two_paragraph"
+ )
+ DIRECT_MODE_SAMPLE_REFERENCE_TABLE = get_or_create_env_var(
+     "DIRECT_MODE_SAMPLE_REFERENCE_TABLE", "True"
+ )
+ DIRECT_MODE_NO_OF_SAMPLED_SUMMARIES = get_or_create_env_var(
+     "DIRECT_MODE_NO_OF_SAMPLED_SUMMARIES", str(DEFAULT_SAMPLED_SUMMARIES)
+ )
+ DIRECT_MODE_RANDOM_SEED = get_or_create_env_var(
+     "DIRECT_MODE_RANDOM_SEED", str(LLM_SEED)
+ )
+ DIRECT_MODE_CREATE_XLSX_OUTPUT = get_or_create_env_var(
+     "DIRECT_MODE_CREATE_XLSX_OUTPUT", "True"
+ )
+ # DIRECT_MODE_INFERENCE_SERVER_MODEL is set at the end of this file, once the log header lists have been parsed
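Because all of these values are resolved at import time, direct mode is configured by exporting the variables before `tools.config` is first imported. A minimal sketch (the input file is one of the bundled examples; the column name is an assumption based on the example output filenames):

```python
import os

os.environ["RUN_DIRECT_MODE"] = "1"
os.environ["DIRECT_MODE_TASK"] = "extract"
os.environ["DIRECT_MODE_INPUT_FILE"] = "example_data/dummy_consultation_response.csv"
os.environ["DIRECT_MODE_TEXT_COLUMN"] = "Response text"  # assumed column name

from tools import config  # import only after the environment is set

print(config.DIRECT_MODE_TASK, config.DIRECT_MODE_TEXT_COLUMN)
```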
+
+ MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
+
+ MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb")
+
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
+
+ ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")
+
+ DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3")
+
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "")
+
+ ALLOW_LIST_PATH = get_or_create_env_var(
+     "ALLOW_LIST_PATH", ""
+ )  # e.g. config/default_allow_list.csv
+
+ S3_ALLOW_LIST_PATH = get_or_create_env_var(
+     "S3_ALLOW_LIST_PATH", ""
+ )  # e.g. default_allow_list.csv; this is a path within the named S3 bucket
+
+ if ALLOW_LIST_PATH:
+     OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
+ else:
+     OUTPUT_ALLOW_LIST_PATH = "config/default_allow_list.csv"
+
+ FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "125"))
+
+ SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "True")
+
+ ###
+ # COST CODE OPTIONS
+ ###
+
+ SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")
+
+ GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")
+
+ DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
+
+ COST_CODES_PATH = get_or_create_env_var(
+     "COST_CODES_PATH", ""
+ )  # e.g. 'config/COST_CENTRES.csv'. The file should be a csv file containing a single table with two columns and a header: the first column should contain cost codes, the second a name or description for each cost code
+
+ S3_COST_CODES_PATH = get_or_create_env_var(
+     "S3_COST_CODES_PATH", ""
+ )  # e.g. COST_CENTRES.csv; this is a path within the named S3 bucket
+
+ # A default path in case an S3 cost code location is provided but no local cost code location is given
+ if COST_CODES_PATH:
+     OUTPUT_COST_CODES_PATH = COST_CODES_PATH
+ else:
+     OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
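For reference, a file matching the described cost code format can be generated like this (the codes and descriptions are invented):

```python
import pandas as pd

# Two columns with a header: cost codes first, then a name/description.
pd.DataFrame(
    {
        "Cost code": ["A100", "B200"],
        "Description": ["Consultation analysis", "Case note review"],
    }
).to_csv("config/cost_codes.csv", index=False)
```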
+
+ ENFORCE_COST_CODES = get_or_create_env_var(
+     "ENFORCE_COST_CODES", "False"
+ )  # If you have cost codes listed, is it compulsory to choose one before running an analysis?
+
+ if ENFORCE_COST_CODES == "True":
+     GET_COST_CODES = "True"
+
+ ###
+ # VALIDATE FOLDERS AND CONFIG OPTIONS
+ ###
+
+
+ def ensure_folder_exists(output_folder: str):
+     """Creates the specified folder if it does not already exist."""
+
+     if not os.path.exists(output_folder):
+         os.makedirs(output_folder, exist_ok=True)
+         print(f"Created the {output_folder} folder.")
+
+
+ def _get_env_list(env_var_value: str, strip_strings: bool = True) -> List[str]:
+     """Parses a bracketed, comma-separated environment variable value into a list of strings."""
+     # Drop the surrounding brackets and quotes, e.g. "['a', 'b']" -> "a, b"
+     value = env_var_value[1:-1].strip().replace('"', "").replace("'", "")
+     if not value:
+         return []
+     # Split by comma and filter out any empty strings that might result from extra commas
+     if strip_strings:
+         return [s.strip() for s in value.split(",") if s.strip()]
+     else:
+         return [codecs.decode(s, "unicode_escape") for s in value.split(",") if s]
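For example, given the kind of bracketed value the header variables below typically hold, the helper behaves like this:

```python
headers_env_value = "['session_id', 'file_name', 'model_choice']"
print(_get_env_list(headers_env_value))
# -> ['session_id', 'file_name', 'model_choice']
```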
+
+
+ # Convert string environment variables to bool or list
+ SAVE_LOGS_TO_CSV = SAVE_LOGS_TO_CSV == "True"
+ SAVE_LOGS_TO_DYNAMODB = SAVE_LOGS_TO_DYNAMODB == "True"
+
+ if CSV_ACCESS_LOG_HEADERS:
+     CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
+ if CSV_FEEDBACK_LOG_HEADERS:
+     CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
+ if CSV_USAGE_LOG_HEADERS:
+     CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
+
+ if DYNAMODB_ACCESS_LOG_HEADERS:
+     DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
+ if DYNAMODB_FEEDBACK_LOG_HEADERS:
+     DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
+ if DYNAMODB_USAGE_LOG_HEADERS:
+     DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
+
+ # Set DIRECT_MODE_INFERENCE_SERVER_MODEL now that CHOSEN_INFERENCE_SERVER_MODEL is defined
+ DIRECT_MODE_INFERENCE_SERVER_MODEL = get_or_create_env_var(
+     "DIRECT_MODE_INFERENCE_SERVER_MODEL",
+     CHOSEN_INFERENCE_SERVER_MODEL if CHOSEN_INFERENCE_SERVER_MODEL else "",
+ )
tools/custom_csvlogger.py ADDED
@@ -0,0 +1,333 @@
+ from __future__ import annotations
+
+ import csv
+ import os
+ import re
+ import time
+ import uuid
+ from collections.abc import Sequence
+ from datetime import datetime
+
+ # from multiprocessing import Lock
+ from pathlib import Path
+ from threading import Lock
+ from typing import TYPE_CHECKING, Any
+
+ import boto3
+ import botocore
+ from gradio import utils
+ from gradio.flagging import FlaggingCallback
+ from gradio_client import utils as client_utils
+
+ from tools.config import AWS_ACCESS_KEY, AWS_REGION, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
+
+ # Lock and FlaggingCallback are needed at runtime, so only the Component
+ # type hint import lives under TYPE_CHECKING.
+ if TYPE_CHECKING:
+     from gradio.components import Component
+
+
+ class CSVLogger_custom(FlaggingCallback):
+     """
+     The default implementation of the FlaggingCallback abstract class in gradio>=5.0. Each flagged
+     sample (both the input and output data) is logged to a CSV file with headers on the machine running
+     the gradio app. Unlike ClassicCSVLogger, this implementation is concurrent-safe and it creates a new
+     dataset file every time the headers of the CSV (derived from the labels of the components) change. It also
+     only creates columns for "username" and "flag" if the flag_option and username are provided, respectively.
+
+     Example:
+         import gradio as gr
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             flagging_callback=CSVLogger_custom())
+     Guides: using-flagging
+     """
+
+     def __init__(
+         self,
+         simplify_file_data: bool = True,
+         verbose: bool = True,
+         dataset_file_name: str | None = None,
+     ):
+         """
+         Parameters:
+             simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
+             verbose: If True, prints messages to the console about the dataset file creation
+             dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
+         """
+         self.simplify_file_data = simplify_file_data
+         self.verbose = verbose
+         self.dataset_file_name = dataset_file_name
+         self.lock = Lock()
+
+     def setup(
+         self,
+         components: Sequence[Component],
+         flagging_dir: str | Path,
+     ):
+         self.components = components
+         self.flagging_dir = Path(flagging_dir)
+         self.first_time = True
+
+     def _create_dataset_file(
+         self,
+         additional_headers: list[str] | None = None,
+         replacement_headers: list[str] | None = None,
+     ):
+         os.makedirs(self.flagging_dir, exist_ok=True)
+
+         if replacement_headers:
+             if additional_headers is None:
+                 additional_headers = []
+
+             if len(replacement_headers) != len(self.components):
+                 raise ValueError(
+                     f"replacement_headers must have the same length as components "
+                     f"({len(replacement_headers)} provided, {len(self.components)} expected)"
+                 )
+             headers = replacement_headers + additional_headers + ["timestamp"]
+         else:
+             if additional_headers is None:
+                 additional_headers = []
+             headers = (
+                 [
+                     getattr(component, "label", None) or f"component {idx}"
+                     for idx, component in enumerate(self.components)
+                 ]
+                 + additional_headers
+                 + ["timestamp"]
+             )
+
+         headers = utils.sanitize_list_for_csv(headers)
+         dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
+
+         if self.dataset_file_name:
+             self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+         elif dataset_files:
+             try:
+                 # Reuse the most recent dataset file if its headers still match;
+                 # otherwise start a new file with an incremented number.
+                 latest_file = max(
+                     dataset_files, key=lambda f: int(re.findall(r"\d+", f.stem)[0])
+                 )
+                 latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
+
+                 with open(latest_file, newline="", encoding="utf-8-sig") as csvfile:
+                     reader = csv.reader(csvfile)
+                     existing_headers = next(reader, None)
+
+                 if existing_headers != headers:
+                     new_num = latest_num + 1
+                     self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
+                 else:
+                     self.dataset_filepath = latest_file
+             except Exception:
+                 self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+         else:
+             self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+
+         if not Path(self.dataset_filepath).exists():
+             with open(
+                 self.dataset_filepath, "w", newline="", encoding="utf-8-sig"
+             ) as csvfile:
+                 writer = csv.writer(csvfile)
+                 writer.writerow(utils.sanitize_list_for_csv(headers))
+             if self.verbose:
+                 print("Created dataset file at:", self.dataset_filepath)
+         elif self.verbose:
+             print("Using existing dataset file at:", self.dataset_filepath)
+
+     def flag(
+         self,
+         flag_data: list[Any],
+         flag_option: str | None = None,
+         username: str | None = None,
+         save_to_csv: bool = True,
+         save_to_dynamodb: bool = False,
+         dynamodb_table_name: str | None = None,
+         dynamodb_headers: list[str] | None = None,  # Headers to use for the DynamoDB item
+         replacement_headers: list[str] | None = None,
+     ) -> int:
+         if self.first_time:
+             additional_headers = []
+             if flag_option is not None:
+                 additional_headers.append("flag")
+             if username is not None:
+                 additional_headers.append("username")
+             additional_headers.append("id")
+             self._create_dataset_file(
+                 additional_headers=additional_headers,
+                 replacement_headers=replacement_headers,
+             )
+             self.first_time = False
+
+         csv_data = []
+         for idx, (component, sample) in enumerate(
+             zip(self.components, flag_data, strict=False)
+         ):
+             save_dir = (
+                 self.flagging_dir
+                 / client_utils.strip_invalid_filename_characters(
+                     getattr(component, "label", None) or f"component {idx}"
+                 )
+             )
+             if utils.is_prop_update(sample):
+                 csv_data.append(str(sample))
+             else:
+                 data = (
+                     component.flag(sample, flag_dir=save_dir)
+                     if sample is not None
+                     else ""
+                 )
+                 if self.simplify_file_data:
+                     data = utils.simplify_file_data_in_str(data)
+                 csv_data.append(data)
+
+         if flag_option is not None:
+             csv_data.append(flag_option)
+         if username is not None:
+             csv_data.append(username)
+
+         generated_id = str(uuid.uuid4())
+         csv_data.append(generated_id)
+
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
+             :-3
+         ]  # Millisecond precision, the format expected by Amazon Athena
+         csv_data.append(timestamp)
+
+         # Build the headers in the same order as csv_data
+         headers = [
+             getattr(component, "label", None) or f"component {idx}"
+             for idx, component in enumerate(self.components)
+         ]
+         if flag_option is not None:
+             headers.append("flag")
+         if username is not None:
+             headers.append("username")
+         headers.append("id")
+         headers.append("timestamp")
+
+         line_count = -1
+
+         if save_to_csv:
+             with self.lock:
+                 with open(
+                     self.dataset_filepath, "a", newline="", encoding="utf-8-sig"
+                 ) as csvfile:
+                     writer = csv.writer(csvfile)
+                     writer.writerow(utils.sanitize_list_for_csv(csv_data))
+                 with open(self.dataset_filepath, encoding="utf-8-sig") as csvfile:
+                     line_count = len(list(csv.reader(csvfile))) - 1
+
+         if save_to_dynamodb:
+             print("Saving to DynamoDB")
+
+             if RUN_AWS_FUNCTIONS == "1":
+                 try:
+                     print("Connecting to DynamoDB via existing SSO connection")
+                     dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
+
+                     dynamodb.meta.client.list_tables()
+
+                 except Exception as e:
+                     print("No SSO credentials found:", e)
+                     if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+                         print("Trying DynamoDB credentials from environment variables")
+                         dynamodb = boto3.resource(
+                             "dynamodb",
+                             aws_access_key_id=AWS_ACCESS_KEY,
+                             aws_secret_access_key=AWS_SECRET_KEY,
+                             region_name=AWS_REGION,
+                         )
+                     else:
+                         raise Exception(
+                             "AWS credentials for DynamoDB logging not found"
+                         )
+             else:
+                 raise Exception(
+                     "RUN_AWS_FUNCTIONS is not enabled, so DynamoDB logging is unavailable"
+                 )
+
+             if dynamodb_table_name is None:
+                 raise ValueError(
+                     "You must provide a dynamodb_table_name if save_to_dynamodb is True"
+                 )
+
+             # Prefer explicitly supplied headers, then replacement_headers, then the CSV headers
+             if not dynamodb_headers:
+                 if replacement_headers:
+                     dynamodb_headers = replacement_headers
+                 elif headers:
+                     dynamodb_headers = headers
+                 else:
+                     raise ValueError(
+                         "Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table."
+                     )
+
+             # Make sure the bookkeeping columns are present (in the same order as csv_data)
+             if flag_option is not None and "flag" not in dynamodb_headers:
+                 dynamodb_headers.append("flag")
+             if username is not None and "username" not in dynamodb_headers:
+                 dynamodb_headers.append("username")
+             if "id" not in dynamodb_headers:
+                 dynamodb_headers.append("id")
+             if "timestamp" not in dynamodb_headers:
+                 dynamodb_headers.append("timestamp")
+
+             # Create the table if it doesn't exist yet
+             try:
+                 table = dynamodb.Table(dynamodb_table_name)
+                 table.load()
+             except botocore.exceptions.ClientError as e:
+                 if e.response["Error"]["Code"] == "ResourceNotFoundException":
+                     attribute_definitions = [
+                         {
+                             "AttributeName": "id",
+                             "AttributeType": "S",
+                         }  # Only key attributes are defined up front
+                     ]
+
+                     table = dynamodb.create_table(
+                         TableName=dynamodb_table_name,
+                         KeySchema=[
+                             {"AttributeName": "id", "KeyType": "HASH"}  # Partition key
+                         ],
+                         AttributeDefinitions=attribute_definitions,
+                         BillingMode="PAY_PER_REQUEST",
+                     )
+                     # Wait until the table exists
+                     table.meta.client.get_waiter("table_exists").wait(
+                         TableName=dynamodb_table_name
+                     )
+                     time.sleep(5)
+                     print(f"Table '{dynamodb_table_name}' created successfully.")
+                 else:
+                     raise
+
+             # Prepare the DynamoDB item to upload
+             try:
+                 item = {
+                     "id": str(generated_id),  # UUID primary key
+                     "timestamp": timestamp,
+                 }
+
+                 # Map the headers to values
+                 item.update(
+                     {
+                         header: str(value)
+                         for header, value in zip(dynamodb_headers, csv_data)
+                     }
+                 )
+
+                 table.put_item(Item=item)
+
+                 print("Successfully uploaded log to DynamoDB")
+             except Exception as e:
+                 print("Could not upload log to DynamoDB due to", e)
+
+         return line_count
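A minimal usage sketch for the logger (the component labels, folder and file names are invented; DynamoDB logging additionally needs valid AWS credentials and a `dynamodb_table_name`):

```python
import gradio as gr

from tools.custom_csvlogger import CSVLogger_custom

logger = CSVLogger_custom(dataset_file_name="usage_log.csv")
logger.setup(
    components=[gr.Textbox(label="session_id"), gr.Textbox(label="file_name")],
    flagging_dir="logs/usage",
)
rows_written = logger.flag(
    ["abc-123", "dummy_consultation_response.csv"],
    username="demo_user",
    save_to_csv=True,
    save_to_dynamodb=False,  # set True (plus dynamodb_table_name=...) to also log to DynamoDB
)
print(rows_written)  # number of data rows now in the CSV
```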
tools/dedup_summaries.py ADDED
The diff for this file is too large to render. See raw diff
 
tools/example_table_outputs.py ADDED
@@ -0,0 +1,94 @@
+ dummy_consultation_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
+ |:---------------------|:-----------------------|:------------|:--------|----------------------:|:--------------------------------------------------------------------------------------------------------------|
+ | Development proposal | Affordable housing | Positive | All | 6 | The proposed development is overwhelmingly viewed favorably by the local community, primarily due to<br>its potential to address significant needs. residents express strong support for the provision of<br>amenities, particularly much-needed housing for young people and families. crucially, the<br>development is also considered vital for increasing green space, with some suggesting this could<br>even contribute to further housing opportunities. <br>furthermore, a key theme emerging from the<br>responses is the des... |
+ | Development proposal | Environmental damage | Negative | All | 5 | A primary concern expressed across the dataset relates to the potential negative consequences of a<br>proposed development on the local environment and wildlife. multiple respondents highlighted worries<br>about environmental damage, suggesting a significant apprehension regarding the project's impact.<br>specifically, there's a shared concern about the detrimental effects on local wildlife, indicating a<br>potential disruption to the natural ecosystem.<br>furthermore, the proposed development is perceived<br>to ... |
+ | Community impact | Loss of local business | Negative | All | 4 | A significant concern expressed within the dataset relates to the potential negative consequences of<br>a development project on local businesses and the wider community. multiple respondents voiced<br>worries about increased traffic congestion, directly impacting the viability of local businesses and<br>leading to apprehension about their future. specifically, there is a palpable sadness surrounding<br>the possibility of a beloved local cafe closing, emphasizing the detrimental effect on community<br>connect... |
+ | Development proposal | Architectural style | Negative | All | 4 | The primary concern regarding the proposed development is its incompatibility with the established<br>character of the area. residents express a strong feeling that the design is fundamentally at odds<br>with the existing aesthetic and atmosphere, suggesting a lack of sensitivity to the local context.<br>this sentiment highlights a significant apprehension about disrupting the area's unique<br>identity.<br>furthermore, significant anxieties center on the potential negative impact of the<br>development on the surr... |
+ | Economic impact | Investment and jobs | Positive | All | 4 | The proposed development is widely anticipated to generate significant positive economic impacts<br>within the local community. residents believe it will lead to substantial investment and the<br>creation of numerous job opportunities, directly boosting the local economy and revitalizing the<br>town centre. there's a strong consensus that this development represents a key step towards economic<br>growth and prosperity for the area.<br>specifically, the anticipated benefits include the creation<br>of jobs for loca... |
+ | Development proposal | Height of building | Negative | All | 3 | Residents expressed significant concerns regarding the proposed development's height, specifically<br>highlighting the five-storey structure as a major issue. this height was perceived as excessively<br>tall and likely to cause overshadowing of existing buildings in the area, directly impacting the<br>views enjoyed by current residents. the potential for diminished sunlight and altered visual<br>landscapes was a central point of contention.<br>furthermore, the impact on the existing character<br>of the neighborho... |
+ | Development proposal | Infrastructure impact | Negative | All | 3 | Analysis of the provided text reveals significant concerns regarding the potential consequences of a<br>proposed project on existing infrastructure. specifically, there is a notable worry about the<br>detrimental effects on local infrastructure, with the possibility of widespread disruption as a key<br>consequence. this suggests a need for careful assessment and mitigation strategies to avoid<br>negatively impacting essential services and community operations.<br>furthermore, the repeated<br>emphasis on infrastru... |
+ | Development proposal | Noise pollution | Negative | All | 3 | The primary concern highlighted within the dataset relates to anticipated noise pollution stemming<br>from the proposed development. multiple responses explicitly express this worry, emphasizing it as a<br>"significant concern" and a key area of apprehension. there's a clear understanding that the<br>development will likely exacerbate existing noise levels within the surrounding area, suggesting a<br>potential negative impact on residents and the local environment.<br>several respondents reiterate<br>this concer... |
+ | Housing needs | Supply of housing | Positive | All | 3 | The proposed development is viewed as a crucial solution to address the town's significant housing<br>shortage, reflecting a clear desire for increased housing supply within the community. residents<br>express a strong need for more homes, and this development is seen as a key step towards meeting<br>that demand. <br>furthermore, the project is anticipated to alleviate existing parking issues, which<br>is considered a valuable contribution to the overall housing supply. the provision of additional<br>parking spac... |
+ | Development proposal | Height of building | Neutral | All | 2 | The analysis of the provided text reveals a nuanced perspective regarding a development project,<br>with no explicit sentiment expressed concerning the building's height itself. however, significant<br>concerns are raised about the potential impact of the development on local schools. these concerns<br>appear to be linked, at least in part, to the building's height, suggesting a worry that the<br>increased scale could strain existing resources and infrastructure within the school system.<br><br>further investigat... |
+ | Community impact | Community facilities | Negative | All | 1 | Concerns exist regarding the negative impact on local amenities. |
+ | Community impact | Community facilities | Positive | All | 1 | The development will provide much-needed community facilities, enhancing the local area. |
+ | Development proposal | Architectural style | Neutral | All | 1 | The development is expected to provide facilities for young people, but no specific architectural<br>concerns. |
+ | Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
+ | Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
+
+ dummy_consultation_table_zero_shot = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
+ |:---------------------------|:------------------------------------|:------------|:--------|----------------------:|:--------------------------------------------------------------------------------------------------------------|
+ | Planning & development | Impact on the character of the area | Negative | All | 10 | Residents overwhelmingly express strong objections to the proposed development, primarily focusing<br>on its incompatibility with the established character of the area. A central concern is the<br>development's height and design, which they believe clashes significantly with the existing<br>aesthetic and creates a sense of being overshadowed by taller structures, leading to a feeling of<br>crampedness. Many respondents specifically highlighted the potential for the development to<br>negatively impact Main Stre... |
+ | Environmental impact | Impact on the local environment | Negative | All | 8 | Several concerns have been raised regarding the potential negative impacts of a development on the<br>local environment. Multiple respondents expressed worry about the development's possible detrimental<br>effects on the surrounding environment and quality of life, highlighting a significant area of<br>concern. These anxieties include potential damage to the environment and a general feeling of unease<br>about the development's consequences.<br><br>Despite a single positive note regarding the provision<br>of green s... |
+ | Infrastructure & transport | Traffic congestion | Negative | All | 7 | Concerns regarding increased traffic congestion are prevalent in the dataset, largely stemming from<br>the anticipated impact of the proposed development. Specifically, Main Street is predicted to<br>experience heightened congestion due to the increased volume of traffic it will attract. Multiple<br>responses repeatedly highlight this anticipation as a key issue associated with the<br>project.<br><br>Despite the consistent apprehension about traffic congestion, no direct responses<br>offer specific solutions or miti... |
+ | Planning & development | Need for family housing | Positive | All | 7 | The proposed development is overwhelmingly viewed as a crucial solution to the need for family<br>housing within the community. Multiple sources highlight its significance in providing much-needed<br>homes, particularly for families, and specifically addressing the demand for affordable family<br>housing options. Several respondents emphasized the beneficial impact on local residents, with the<br>development also anticipated to create jobs and offer facilities geared towards young people<br>alongside housing. ... |
+ | Quality of life | Impact on quality of life | Negative | All | 7 | Analysis of the provided text reveals significant concerns regarding a proposed development's<br>potential negative impact on the quality of life within the area. Residents are particularly worried<br>that the development will overshadow existing buildings, creating a sense of crampedness and<br>diminishing their living experience. Furthermore, anxieties extend beyond immediate residential<br>impacts, encompassing broader concerns about the development's effects on local businesses, schools,<br>and crucial inf... |
+ | Economic impact | Investment and job creation | Positive | All | 6 | The proposed development is overwhelmingly viewed positively, with significant anticipation for its<br>economic impact on the area. Residents and observers alike believe it will stimulate considerable<br>investment and generate numerous job opportunities, particularly for local residents. Furthermore,<br>the project is expected to revitalize the town center and provide crucial affordable housing,<br>potentially benefiting young people seeking to establish themselves in the<br>community.<br><br>Specifically, the deve... |
+ | Infrastructure & transport | Parking | Negative | All | 6 | Analysis of the '{column_name}' column reveals significant concerns regarding the potential impact<br>of a new development on Main Street. The primary issue identified is increased traffic congestion,<br>directly linked to the development's activity. Furthermore, there is widespread apprehension that<br>the project will worsen existing parking problems, with multiple respondents explicitly stating a<br>lack of adequate parking provisions as a key worry. <br><br>Specifically, numerous individuals<br>expressed concern... |
+ | Community & local life | Amenities for the local community | Positive | All | 5 | The proposed development is anticipated to significantly benefit the local community, offering a<br>range of amenities and a positive contribution to the area. Specifically, the project will deliver<br>crucial green space alongside facilities designed to cater to the needs of young people and the<br>broader community.<br><br>Furthermore, the development is expected to address critical social needs<br>by providing much-needed community facilities and social housing, indicating a commitment to<br>supporting local resi... |
+ | Environmental impact | Impact on local wildlife | Neutral | All | 4 | No specific responses were provided, and the dataset contained no information relevant to the<br>specified consultation context. Consequently, a summary cannot be generated based on the provided<br>data. <br><br>Due to the absence of any textual data within the dataset, there is no content to<br>consolidate and summarize. |
+ | Improvement of main street | Improvement of main street | Positive | All | 4 | This development is being hailed as a positive step for the revitalization of Main Street, primarily<br>due to its anticipated improvement in the street's appearance. Stakeholders view this initiative as<br>a crucial element in breathing new life into the area, suggesting a significant upgrade to the<br>existing landscape.<br><br>Specifically, the project aims to enhance the visual appeal of Main<br>Street, representing a tangible advancement in its overall attractiveness and desirability. The<br>development is wide... |
+ | Planning & development | Impact on views | Negative | All | 4 | A primary concern expressed regarding the proposed development is its potential negative impact on<br>existing views. Multiple respondents voiced worries about how the development might obstruct or<br>diminish the current vistas, alongside specific concerns about its effect on views from neighboring<br>properties. This suggests a significant sensitivity to the visual landscape and its value within the<br>community.<br><br>Furthermore, the potential aesthetic consequences of the development are<br>highlighted, with s... |
+ | Community & local life | Amenities for the local community | Negative | All | 2 | Residents are voicing significant concerns regarding a proposed development, primarily focusing on<br>its anticipated detrimental effects on local amenities. A key point of contention is the planned<br>removal of the existing cafe, which is being viewed as a substantial loss to the community's social<br>fabric and a vital local resource.<br><br>The overall sentiment suggests a strong apprehension that<br>the development will diminish the quality of life for those living nearby, highlighting a desire to<br>preserve c... |
+ | Impact on local businesses | Impact on local businesses | Negative | All | 2 | A primary concern expressed relates to the potential detrimental effects of the development on local<br>businesses. There's a clear worry that the project will negatively impact these businesses,<br>suggesting a potential loss of revenue, customer base, or even business closure. The repeated<br>emphasis on a "negative impact" highlights a significant apprehension regarding the economic<br>repercussions for the existing business community.<br><br>The sentiment underscores a desire to<br>mitigate potential harm and li... |
+ | Impact on local heritage | Impact on local heritage | Negative | All | 2 | There are growing concerns regarding the potential negative impact of the development on the local<br>heritage. While specific details and references haven't been explicitly stated, the underlying<br>sentiment suggests a worry about the development's effects on historically significant elements<br>within the area. This implies a recognition that the proposed project could, perhaps inadvertently,<br>threaten or diminish the cultural value and character of the local environment.<br><br>The presence<br>of these concern... |
+ | Environmental impact | Impact on local wildlife | Negative | All | 1 | Concerns regarding the negative impact of the development on local wildlife. |
+ | Impact on local heritage | Impact on local heritage | Neutral | All | 1 | No specific responses mention this topic. |
+ | Impact on local schools | Impact on local schools | Negative | All | 1 | Concerns about the negative impact on the local schools. |
+ | Impact on local schools | Impact on local schools | Neutral | All | 1 | No specific responses mention this topic. |
+ | Infrastructure & transport | Parking | Positive | All | 1 | The development is expected to provide much-needed parking spaces. |"""
+
+ case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
+ |:------------------|:----------------------------|:------------|:--------|----------------------:|:--------------------------------------------------------------------------------------------------------------|
+ | Family dynamics | Parental conflict | Negative | All | 6 | Several parents expressed significant concerns regarding the well-being of their children, primarily<br>focusing on escalating aggression and withdrawal. alex's mother specifically highlighted a pattern<br>of arguments at home and attributed the aggressive behavior to external provocation, suggesting a<br>destabilizing family environment. furthermore, parents voiced a lack of confidence in existing<br>interventions for their children, particularly jamie, indicating a perceived need for supplemental<br>support ... |
+ | Mental health | Feelings of isolation | Negative | All | 6 | Several individuals expressed significant emotional distress, primarily centered around feelings of<br>isolation and hopelessness. jamie's withdrawal and reported emptiness suggest a deep-seated sense of<br>disconnection, while alex powerfully articulated his feeling of being misunderstood with the<br>statement "no one gets me." these experiences appear to be impacting daily functioning, as evidenced<br>by jamie's struggles and disruption of his sleep patterns. <br>the observations from parents further<br>indicat... |
+ | Overall mood | Agitation | Negative | All | 6 | Several individuals expressed concerns regarding heightened emotional distress within the subject<br>group, primarily focusing on alex. alex repeatedly demonstrated significant frustration and<br>aggression, necessitating anger management support, and appeared increasingly agitated in recent<br>meetings, suggesting a worsening emotional state. parents specifically highlighted jamie's ongoing<br>struggles and agitation, emphasizing the need for a comprehensive assessment and subsequent<br>intervention.<br>while so... |
+ | Family dynamics | Peer influence | Negative | All | 5 | Concerns centered around alex's social circle and behavior highlighted potential negative peer<br>influence, specifically due to his new friends and frequent late-night activities. further<br>investigation revealed troubling admissions from alex himself, including alcohol use and<br>participation in a physical altercation with a fellow student, indicating a concerning pattern of<br>risk-taking behavior. <br>simultaneously, jamie's situation presented a separate area of concern,<br>characterized by isolation and l... |
+ | Substance use | Potential substance abuse | Negative | All | 5 | Alex has disclosed alcohol use, raising significant concerns about potential ongoing substance<br>abuse. the situation is further complicated by reports from alex's mother, who has observed<br>potential signs of substance abuse and expressed her worries regarding this matter. these<br>observations highlight a need for further assessment and support to address the individual's<br>substance use patterns and ensure their well-being.<br>the situation requires careful monitoring and<br>intervention. the mother's repor... |
+ | Mental health | Depression | Positive | All | 3 | Jamie is diagnosed with major depressive disorder and initiated on antidepressant medication, with<br>initial positive feedback on mood and energy. |
+ | Mental health | Self-harm | Negative | All | 3 | The assessment revealed a complex picture regarding the individual's mental state. while initial<br>observations did not indicate any active self-harm, a thorough evaluation is strongly recommended to<br>identify potential underlying issues contributing to the risk. this proactive approach is crucial<br>for a complete understanding of the individual's needs.<br>more concerningly, alex presented with<br>visible self-harm indicators on his arms and explicitly communicated thoughts of self-harm,<br>signifying a sign... |
+ | School engagement | Absenteeism | Negative | All | 3 | Recent reports highlight a concerning trend of declining student engagement within the school<br>environment. specifically, there has been an increase in absences alongside a decrease in academic<br>performance, suggesting a fundamental lack of connection with schoolwork and learning. several<br>students, including jamie, are exhibiting problematic behaviors that further underscore this issue,<br>such as consistent tardiness and reduced participation in classroom activities.<br>furthermore,<br>observations indica... |
+ | Mental health | Depression | Negative | All | 2 | Jamie exhibits symptoms of moderate depression, requiring further evaluation and intervention. |
+ | Mental health | Self-harm | Neutral | All | 2 | The psychiatrist's assessment centered on the potential advantages and drawbacks of antidepressant<br>medication, with a notable emphasis on evaluating the possibility of self-harm risk. this indicates<br>a proactive approach to patient safety and a recognition of the complex interplay between medication<br>and mental health. the discussion highlights a careful consideration of the potential for increased<br>suicidal ideation, suggesting a thorough risk assessment was undertaken.<br>furthermore, the<br>analysis o... |
+ | School engagement | Academic performance | Negative | All | 2 | Analysis of the provided text reveals concerns regarding student engagement and academic<br>performance. specifically, jamie's reduced involvement in class is flagged as a potential indicator<br>of negative consequences, with declining grades reported as a direct result. this suggests a<br>concerning downward trend in alex's academic progress, highlighting a need for further investigation<br>into the underlying causes of this shift.<br>the combined observations point to a possible<br>correlation between decreased... |
+ | Substance use | Substance use (unspecified) | Negative | All | 2 | Concerns regarding ongoing substance use prompted discussion about the possibility of a short-term<br>residential treatment program. alex's involvement highlighted a potential issue, as they reported<br>occasional substance use, though the specific substances involved were not detailed during the<br>consultation. this lack of specificity regarding the substances used raises a need for further<br>investigation into the nature and frequency of alex's substance use.<br>the consultation focused on<br>assessing the ri... |
+ | Family dynamics | Stepfather relationship | Negative | All | 1 | Alex displayed sudden outbursts of anger when discussing his new stepfather, indicating significant<br>distress related to this family change. |
+ | School engagement | Academic performance | Positive | All | 1 | Jamie's academic performance has slightly improved, indicating a potential positive change. |"""
+
+ case_notes_table_grouped = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
+ |:--------------------|:---------------------------|:------------|:---------|----------------------:|:--------------------------------------------------------------------------------------------------------------|
+ | Trends over time | Trends over time | Negative | Alex D. | 7 | Alex's case note reveals a troubling deterioration in his well-being marked by a gradual escalation<br>of issues. Initially, the record details an incident involving a physical altercation, which quickly<br>spiraled into increasingly concerning behaviours at home, specifically escalating aggression. Over<br>subsequent meetings, observations consistently pointed towards heightened agitation and expressions<br>of hopelessness, indicating a worsening emotional state and a significant decline in his overall<br>con... |
+ | Physical health | Substance misuse | Negative | Alex D. | 6 | Alex's substance use remains a significant concern, necessitating continued vigilance and support<br>despite recent positive developments in group therapy. While Alex has acknowledged instances of<br>substance use, the details surrounding these occurrences have not been shared, raising questions<br>about the extent and nature of the problem. Concerns were specifically noted regarding potential<br>substance abuse, highlighting a need for further investigation and assessment.<br><br>Ongoing<br>monitoring is crucial to... |
+ | Behaviour at school | Behaviour at school | Negative | Alex D. | 3 | A recent case note details a troubling incident involving a physical altercation at school,<br>alongside concerning admissions from Alex regarding alcohol use. This event has sparked worries<br>about potential behavioural issues within the school setting, suggesting a need for further<br>investigation and support. Alex's demeanor was notably problematic, characterized by sullen behavior<br>and a deliberate avoidance of eye contact, indicating a possible struggle with emotional<br>regulation.<br><br>Furthermore, Alex... |
+ | Mental health | Anger | Negative | Alex D. | 3 | Alex exhibits a pronounced anger issue, characterized by frustration and a tendency to blame others<br>for triggering his aggressive behavior. He demonstrated this significantly when discussing his<br>personal life, particularly relating to his new stepfather, suggesting a volatile emotional response<br>to this change. The observed outbursts highlight a need for immediate intervention to manage his<br>escalating anger.<br><br>Further investigation reveals that Alex's anger is closely linked to his<br>home environmen... |
+ | Mental health | Self-harm | Negative | Alex D. | 3 | The analysis reveals significant concerns regarding Alex's mental health, centering around potential<br>self-harm behaviors. Indications suggest a possible diagnosis of Oppositional Defiant Disorder<br>alongside a co-occurring substance use disorder, warranting a comprehensive treatment plan. Alex<br>demonstrated visible signs of self-harm and openly confessed to experiencing thoughts of self-harm,<br>highlighting a critical need for immediate intervention.<br><br>Following this disclosure, an<br>immediate referral ... |
+ | Mental health | Social issues | Negative | Alex D. | 3 | Alex exhibits a pattern of blaming others for his problematic behavior, indicating underlying<br>challenges in social interaction and conflict resolution. This behavior appears to be contributing<br>to further instability in his life. Specifically, his mother voiced concerns regarding his new<br>social circle and increasingly frequent late-night activities, suggesting she perceives these<br>relationships and outings as potentially risky.<br><br>The mother's observations highlight a<br>potential area of concern for A... |
+ | Mental health | Depression | Negative | Jamie L. | 6 | Jamie is currently experiencing concerning symptoms indicative of depression, as noted by both<br>Jamie's behavior and parental observations. Specifically, he demonstrates limited social<br>interaction, struggles with his mood, and has difficulty engaging with his schoolwork. These<br>difficulties appear persistent, with parents reporting ongoing struggles despite occasional positive<br>moments. <br><br>Further assessment suggests a more pronounced picture, with indications of moderate<br>depression characterized by... |
+ | Mental health | Social isolation | Negative | Jamie L. | 4 | Jamie is experiencing significant social isolation, which is negatively affecting both his academic<br>performance and his general well-being. He has expressed feelings of loneliness and difficulty<br>sleeping, strongly suggesting a core social issue is contributing to his distress. Current efforts<br>are focused on promoting increased social interaction to address these challenges.<br><br>The report<br>highlights the urgency of this situation, emphasizing the need for intervention to mitigate Jamie's<br>isolation a... |
+ | Mental health | Medication | Neutral | Jamie L. | 3 | Consideration is being given to medication as a potential intervention alongside therapy to manage<br>depressive symptoms. Initial feedback on the antidepressant is positive. |"""
69
+ | Mental health | Withdrawal & sadness | Negative | Jamie L. | 3 | Jamie is experiencing a significant downturn in his emotional state, characterized by withdrawal,<br>sadness, and a pervasive sense of emptiness and hopelessness. These negative feelings appear to be<br>triggered by recent reports of tardiness and decreased participation, suggesting a possible link<br>between his behavior and external pressures or expectations. The combination of these symptoms<br>points to a low mood and a feeling of struggle, indicating a potentially serious situation requiring<br>attention.... |
70
+ | Mental health | Low self-worth | Negative | Jamie L. | 2 | Parents are increasingly concerned about Jamie’s well-being due to observed difficulties and a<br>potential lack of self-worth. These concerns are primarily fueled by Jamie’s own statements, where<br>he articulated feelings of low self-esteem and a significant struggle to find<br>motivation.<br><br>Further investigation revealed a direct link between Jamie’s emotional state and<br>recent family financial hardships. The pressures of these struggles appear to have deeply impacted<br>his self-perception and ability to ... |
71
+ | Trends over time | Increasing withdrawal | Negative | Jamie L. | 2 | A significant and worrying trend is emerging regarding withdrawal, necessitating continuous<br>observation and targeted intervention strategies. Specifically, Jamie is exhibiting a noticeable<br>decline in engagement with family activities, representing a key indicator of this broader issue.<br>This withdrawal suggests a potential underlying problem requiring careful assessment and proactive<br>support.<br><br>The observed pattern of withdrawal highlights the importance of sustained monitoring<br>to understand its p... |
72
+ | Behaviour at school | Attendance issues | Negative | Jamie L. | 1 | Jamie’s consistent tardiness was a concern leading to a meeting. |
73
+ | Behaviour at school | Reduced participation | Negative | Jamie L. | 1 | Jamie’s decreased participation in class was noted. |
74
+ | Behaviour at school | Social engagement | Negative | Jamie L. | 1 | Jamie's withdrawal from family activities and hobbies was highlighted. |
75
+ | Behaviour at school | Social engagement | Positive | Jamie L. | 1 | Encouraging Jamie to join school clubs and groups is a strategy to foster social connection and<br>improve his social engagement. |
76
+ | Family & social | Family communication | Negative | Jamie L. | 1 | Parents expressed concerns about Jamie’s withdrawal and lack of communication within the family. |
77
+ | Family & social | Family communication | Neutral | Jamie L. | 1 | Parents are actively involved in Jamie's care and are communicating their observations to the care<br>team. |
78
+ | Family & social | Family financial struggles | Negative | Jamie L. | 1 | Jamie's low motivation is attributed to recent family financial difficulties. |"""
79
+
80
+ case_notes_table_structured_summary = """| Main heading | Subheading | Summary | Group |
81
+ |:--------------------|:--------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|
82
+ | Behaviour at school | Behaviour at school | Several cases involved disruptions at school, including increased absences, declining grades, and a<br>physical altercation. Alex displayed sullenness, avoidance, and agitation, sometimes reacting with<br>frustration. A key theme was isolation and a lack of connection with peers and school staff. | Alex D. |
83
+ | Mental health | Anger | Anger was a prominent feature across multiple cases, particularly when discussing home life and<br>family dynamics. Outbursts of anger were observed, especially related to a new stepfather, and Alex<br>displayed defensiveness when questioned about his actions. | Alex D. |
84
+ | Mental health | Social issues | Alex experienced feelings of isolation and difficulty connecting with others. He had a new group of<br>friends and engaged in late-night outings, which raised concerns about potential risky behaviours<br>and social influences. | Alex D. |
85
+ | Physical health | General | Signs of self-harm were present on Alex’s arms, indicating a heightened level of distress and<br>potentially a need for immediate support. He displayed visible agitation and defensive behaviour<br>during questioning. | Alex D. |
86
+ | Physical health | Substance misuse | Substance use was a recurring concern, with Alex admitting to occasional substance use and his<br>mother reporting potential signs of abuse. Alcohol use was noted in several instances, leading to<br>recommendations for assessment and potential intervention. | Alex D. |
87
+ | Trends over time | Trends over time | There was a gradual escalation of concerning behaviours over time. Early interventions focused on<br>initial meetings and observation, progressing to more intensive interventions like referrals to<br>mental health professionals, residential treatment programs, and family counseling. | Alex D. |
88
+ | Behaviour at school | Behaviour at school | Jamie exhibited concerning behaviours at school, including consistent tardiness and decreased<br>participation in class. This was accompanied by withdrawn behaviour and signs of sadness, suggesting<br>a need for immediate intervention to address potential underlying issues impacting his academic<br>performance. | Jamie L. |
89
+ | Mental health | Anger | There is no direct indication of anger in Jamie's case notes. | Jamie L. |
90
+ | Mental health | Mental health | Jamie displayed concerning signs of mental health difficulties, including feelings of emptiness,<br>hopelessness, low self-worth, and isolation. He reported difficulty sleeping and a lack of<br>motivation. The need for a comprehensive mental health assessment was highlighted to fully<br>understand the nature and severity of his condition. | Jamie L. |
91
+ | Mental health | Social issues | Jamie experienced significant social difficulties, including limited social interactions, feelings<br>of isolation, and a lack of engagement with family activities and hobbies. He spends a lot of time<br>alone in his room. Recommendations focused on fostering connection through school clubs and family<br>therapy were made. | Jamie L. |
92
+ | Physical health | General | While no direct physical health concerns were explicitly stated, Jamie's emotional state and<br>associated symptoms (difficulty sleeping) warrant consideration of his overall well-being and<br>potential physical manifestations of his mental health challenges. | Jamie L. |
93
+ | Physical health | Substance misuse | There is no indication of substance misuse in the provided case notes. | Jamie L. |
94
+ | Trends over time | Trends over time | Jamie’s case demonstrates fluctuating progress. Initial feedback indicated slight improvements in<br>mood on some days, but overall he continues to struggle. A shift occurred with the commencement of<br>antidepressant medication, showing initial positive feedback in terms of mood and energy levels,<br>requiring continued monitoring and adjustment. | Jamie L. |"""
tools/helper_functions.py ADDED
@@ -0,0 +1,1245 @@
1
+ import codecs
2
+ import math
3
+ import os
4
+ import re
5
+ from typing import List
6
+
7
+ import boto3
8
+ import gradio as gr
9
+ import numpy as np
10
+ import pandas as pd
11
+ from botocore.exceptions import ClientError
12
+
13
+ from tools.config import (
14
+ AWS_USER_POOL_ID,
15
+ CUSTOM_HEADER,
16
+ CUSTOM_HEADER_VALUE,
17
+ INPUT_FOLDER,
18
+ MAXIMUM_ZERO_SHOT_TOPICS,
19
+ OUTPUT_FOLDER,
20
+ SESSION_OUTPUT_FOLDER,
21
+ model_full_names,
22
+ model_name_map,
23
+ )
24
+
25
+
26
+ def empty_output_vars_extract_topics():
27
+ # Empty output objects before processing a new file
28
+
29
+ master_topic_df_state = pd.DataFrame()
30
+ master_topic_summary_df_state = pd.DataFrame()
31
+ master_reference_df_state = pd.DataFrame()
32
+ text_output_file = list()
33
+ text_output_file_list_state = list()
34
+ latest_batch_completed = 0
35
+ log_files_output = list()
36
+ log_files_output_list_state = list()
37
+ conversation_metadata_textbox = ""
38
+ estimated_time_taken_number = 0
39
+ file_data_state = pd.DataFrame()
40
+ reference_data_file_name_textbox = ""
41
+ display_topic_table_markdown = ""
42
+ summary_output_file_list = list()
43
+ summary_input_file_list = list()
44
+ overall_summarisation_input_files = list()
45
+ overall_summary_output_files = list()
46
+
47
+ return (
48
+ master_topic_df_state,
49
+ master_topic_summary_df_state,
50
+ master_reference_df_state,
51
+ text_output_file,
52
+ text_output_file_list_state,
53
+ latest_batch_completed,
54
+ log_files_output,
55
+ log_files_output_list_state,
56
+ conversation_metadata_textbox,
57
+ estimated_time_taken_number,
58
+ file_data_state,
59
+ reference_data_file_name_textbox,
60
+ display_topic_table_markdown,
61
+ summary_output_file_list,
62
+ summary_input_file_list,
63
+ overall_summarisation_input_files,
64
+ overall_summary_output_files,
65
+ )
66
+
67
+
68
+ def empty_output_vars_summarise():
69
+ # Empty output objects before summarising files
70
+
71
+ summary_reference_table_sample_state = pd.DataFrame()
72
+ master_topic_summary_df_revised_summaries_state = pd.DataFrame()
73
+ master_reference_df_revised_summaries_state = pd.DataFrame()
74
+ summary_output_files = list()
75
+ summarised_outputs_list = list()
76
+ latest_summary_completed_num = 0
77
+ overall_summarisation_input_files = list()
78
+
79
+ return (
80
+ summary_reference_table_sample_state,
81
+ master_topic_summary_df_revised_summaries_state,
82
+ master_reference_df_revised_summaries_state,
83
+ summary_output_files,
84
+ summarised_outputs_list,
85
+ latest_summary_completed_num,
86
+ overall_summarisation_input_files,
87
+ )
88
+
89
+
90
+ def get_or_create_env_var(var_name: str, default_value: str):
91
+ # Get the environment variable if it exists
92
+ value = os.environ.get(var_name)
93
+
94
+ # If it doesn't exist, set it to the default value
95
+ if value is None:
96
+ os.environ[var_name] = default_value
97
+ value = default_value
98
+
99
+ return value
100
+
101
+
102
+ def get_file_path_with_extension(file_path: str):
103
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
104
+ basename = os.path.basename(file_path)
105
+
106
+ # Return the basename with its extension
107
+ return basename
108
+
109
+
110
+ def get_file_name_no_ext(file_path: str):
111
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
112
+ basename = os.path.basename(file_path)
113
+
114
+ # Then, split the basename and its extension and return only the basename without the extension
115
+ filename_without_extension, _ = os.path.splitext(basename)
116
+
117
+ # print(filename_without_extension)
118
+
119
+ return filename_without_extension
120
+
121
+
122
+ def detect_file_type(filename: str):
123
+ """Detect the file type based on its extension."""
124
+
125
+ # Strip quotes and whitespace that might have been accidentally included
126
+ filename = filename.strip().strip("'\"")
127
+
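+ # Note: .csv.gz and single-file .zip archives are grouped with csv below because
+ # pandas.read_csv can read both compression formats directly.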
128
+ if (
129
+ (filename.endswith(".csv"))
130
+ | (filename.endswith(".csv.gz"))
131
+ | (filename.endswith(".zip"))
132
+ ):
133
+ return "csv"
134
+ elif filename.endswith(".xlsx"):
135
+ return "xlsx"
136
+ elif filename.endswith(".parquet"):
137
+ return "parquet"
138
+ elif filename.endswith(".pdf"):
139
+ return "pdf"
140
+ elif filename.endswith(".jpg"):
141
+ return "jpg"
142
+ elif filename.endswith(".jpeg"):
143
+ return "jpeg"
144
+ elif filename.endswith(".png"):
145
+ return "png"
146
+ else:
147
+ raise ValueError("Unsupported file type.")
148
+
149
+
150
+ def read_file(filename: str, sheet: str = ""):
151
+ """Read the file based on its detected type."""
152
+ # Strip quotes and whitespace that might have been accidentally included
153
+ filename = filename.strip().strip("'\"")
154
+ file_type = detect_file_type(filename)
155
+
156
+ if file_type == "csv":
157
+ return pd.read_csv(filename, low_memory=False)
158
+ elif file_type == "xlsx":
159
+ if sheet:
160
+ return pd.read_excel(filename, sheet_name=sheet)
161
+ else:
162
+ return pd.read_excel(filename)
163
+ elif file_type == "parquet":
164
+ return pd.read_parquet(filename)
+ else:
+ raise ValueError(f"read_file does not support reading '{file_type}' files.")
165
+
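+ # Illustrative usage (file and sheet names are assumptions):
+ # read_file("data.xlsx", sheet="Sheet1") loads one named sheet via pandas.read_excel.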
166
+
167
+ def load_in_file(file_path: str, colnames: List[str] = None, excel_sheet: str = ""):
168
+ """
169
+ Loads in a tabular data file and returns data and file name.
170
+
171
+ Parameters:
172
+ - file_path (str): The path to the file to be processed.
173
+ - colnames (List[str], optional): list of column names to load in
+ - excel_sheet (str, optional): name of the Excel sheet to read, if applicable
174
+ """
175
+
176
+ file_name = get_file_name_no_ext(file_path)
177
+ file_data = read_file(file_path, excel_sheet)
178
+
179
+ if colnames and isinstance(colnames, list):
180
+ col_list = colnames
181
+ else:
182
+ col_list = list(file_data.columns)
183
+
184
+ if not isinstance(col_list, list):
185
+ col_list = [col_list]
186
+
187
+ col_list = [item for item in col_list if item not in ["", "NA"]]
188
+
189
+ for col in col_list:
190
+ file_data[col] = file_data[col].fillna("")
191
+ file_data[col] = (
192
+ file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
193
+ )
194
+
195
+ # print(file_data[colnames])
196
+
197
+ return file_data, file_name
198
+
199
+
200
+ def load_in_data_file(
201
+ file_paths: List[str],
202
+ in_colnames: List[str],
203
+ batch_size: int = 5,
204
+ in_excel_sheets: str = "",
205
+ ):
206
+ """Load in data table, work out how many batches needed."""
207
+
208
+ if not isinstance(in_colnames, list):
209
+ in_colnames = [in_colnames]
210
+
211
+ # print("in_colnames:", in_colnames)
212
+
213
+ try:
214
+ file_data, file_name = load_in_file(
215
+ file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets
216
+ )
217
+ num_batches = math.ceil(len(file_data) / batch_size)
218
+ print(
219
+ f"File {file_name} loaded successfully. Number of rows: {len(file_data)}. Total number of batches: {num_batches}"
220
+ )
221
+
222
+ except Exception as e:
223
+ print("Could not load data file due to:", e)
224
+ file_data = pd.DataFrame()
225
+ file_name = ""
226
+ num_batches = 1
227
+
228
+ return file_data, file_name, num_batches
229
+
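+ # Worked example of the batch arithmetic above (illustrative numbers):
+ # a 23-row file with batch_size=5 gives math.ceil(23 / 5) == 5 batches.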
230
+
231
+ def clean_column_name(
232
+ column_name: str, max_length: int = 20, front_characters: bool = True
233
+ ):
234
+ # Convert to string
235
+ column_name = str(column_name)
236
+ # Replace non-alphanumeric characters (except underscores) with underscores
237
+ column_name = re.sub(r"\W+", "_", column_name)
238
+ # Remove leading/trailing underscores
239
+ column_name = column_name.strip("_")
240
+ # Ensure the result is not empty; fall back to "column" if necessary
241
+ column_name = column_name if column_name else "column"
242
+ # Truncate to max_length
243
+ if front_characters is True:
244
+ output_text = column_name[:max_length]
245
+ else:
246
+ output_text = column_name[-max_length:]
247
+ return output_text
248
+
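+ # Illustrative example (assumed input): clean_column_name("Response text (2024)!")
+ # -> "Response_text_2024"; runs of non-alphanumeric characters become single
+ # underscores, edges are trimmed, and the result is capped at max_length characters.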
249
+
250
+ def load_in_previous_reference_file(file: str):
251
+ """Load in data table from a partially completed consultation summary to continue it."""
252
+
253
+ reference_file_data = pd.DataFrame()
254
+ reference_file_name = ""
255
+ out_message = ""
256
+
257
+ # for file in file_paths:
258
+
259
+ print("file:", file)
260
+
261
+ # If reference table
262
+ if "reference_table" in file:
263
+ try:
264
+ reference_file_data, reference_file_name = load_in_file(file)
265
+ # print("reference_file_data:", reference_file_data.head(2))
266
+ out_message = out_message + " Reference file load successful."
267
+ except Exception as e:
268
+ out_message = "Could not load reference file data:" + str(e)
269
+ raise Exception("Could not load reference file data:", e)
270
+
271
+ if reference_file_data.empty:
272
+ out_message = out_message + " No reference data table provided."
273
+ raise Exception(out_message)
274
+
275
+ print(out_message)
276
+
277
+ return reference_file_data, reference_file_name
278
+
279
+
280
+ def load_in_previous_data_files(
281
+ file_paths_partial_output: List[str], for_modified_table: bool = False
282
+ ):
283
+ """Load in data table from a partially completed consultation summary to continue it."""
284
+
285
+ reference_file_data = pd.DataFrame()
286
+ reference_file_name = ""
287
+ unique_file_data = pd.DataFrame()
288
+ unique_file_name = ""
289
+ out_message = ""
290
+ latest_batch = 0
291
+
292
+ if not file_paths_partial_output:
293
+ out_message = out_message + " No reference or unique data table provided."
294
+ return (
295
+ reference_file_data,
296
+ unique_file_data,
297
+ latest_batch,
298
+ out_message,
299
+ reference_file_name,
300
+ unique_file_name,
301
+ )
302
+
303
+ if not isinstance(file_paths_partial_output, list):
304
+ file_paths_partial_output = [file_paths_partial_output]
305
+
306
+ for file in file_paths_partial_output:
307
+
308
+ if isinstance(file, gr.FileData):
309
+ name = file.name
310
+ else:
311
+ name = file
312
+
313
+ # If reference table
314
+ if "reference_table" in name:
315
+ try:
316
+ reference_file_data, reference_file_name = load_in_file(file)
317
+ # print("reference_file_data:", reference_file_data.head(2))
318
+ out_message = out_message + " Reference file load successful."
319
+
320
+ except Exception as e:
321
+ out_message = "Could not load reference file data:" + str(e)
322
+ raise Exception("Could not load reference file data:", e)
323
+ # If unique table
324
+ if "unique_topic" in name:
325
+ try:
326
+ unique_file_data, unique_file_name = load_in_file(file)
327
+ # print("unique_topics_file:", unique_file_data.head(2))
328
+ out_message = out_message + " Unique table file load successful."
329
+ except Exception as e:
330
+ out_message = "Could not load unique table file data:" + str(e)
331
+ raise Exception("Could not load unique table file data:", e)
332
+ if "batch_" in name:
333
+ latest_batch = re.search(r"batch_(\d+)", name).group(1)
334
+ print("latest batch:", latest_batch)
335
+ latest_batch = int(latest_batch)
336
+
337
+ if latest_batch == 0:
338
+ out_message = out_message + " Latest batch number not found."
339
+ if reference_file_data.empty:
340
+ out_message = out_message + " No reference data table provided."
341
+ # raise Exception(out_message)
342
+ if unique_file_data.empty:
343
+ out_message = out_message + " No unique data table provided."
344
+
345
+ print(out_message)
346
+
347
+ # Return all data if using for deduplication task. Return just modified unique table if using just for table modification
348
+ if for_modified_table is False:
349
+ return (
350
+ reference_file_data,
351
+ unique_file_data,
352
+ latest_batch,
353
+ out_message,
354
+ reference_file_name,
355
+ unique_file_name,
356
+ )
357
+ else:
358
+ reference_file_data.drop("Topic number", axis=1, inplace=True, errors="ignore")
359
+
360
+ unique_file_data = create_topic_summary_df_from_reference_table(
361
+ reference_file_data
362
+ )
363
+
364
+ unique_file_data.drop("Summary", axis=1, inplace=True)
365
+
366
+ # Then merge the topic numbers back to the original dataframe
367
+ reference_file_data = reference_file_data.merge(
368
+ unique_file_data[
369
+ ["General topic", "Subtopic", "Sentiment", "Topic number"]
370
+ ],
371
+ on=["General topic", "Subtopic", "Sentiment"],
372
+ how="left",
373
+ )
374
+
375
+ out_file_names = [reference_file_name + ".csv"]
376
+ out_file_names.append(unique_file_name + ".csv")
377
+
378
+ return (
379
+ unique_file_data,
380
+ reference_file_data,
381
+ unique_file_data,
382
+ reference_file_name,
383
+ unique_file_name,
384
+ out_file_names,
385
+ ) # gr.Dataframe(value=unique_file_data, headers=None, column_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas")
386
+
387
+
388
+ def join_cols_onto_reference_df(
389
+ reference_df: pd.DataFrame,
390
+ original_data_df: pd.DataFrame,
391
+ join_columns: List[str],
392
+ original_file_name: str,
393
+ output_folder: str = OUTPUT_FOLDER,
394
+ ):
395
+
396
+ # print("original_data_df columns:", original_data_df.columns)
397
+ # print("original_data_df:", original_data_df)
398
+
399
+ original_data_df.reset_index(names="Response References", inplace=True)
400
+ original_data_df["Response References"] += 1
401
+
402
+ # print("reference_df columns:", reference_df.columns)
403
+ # print("reference_df:", reference_df)
404
+
405
+ join_columns.append("Response References")
406
+
407
+ reference_df["Response References"] = (
408
+ reference_df["Response References"].fillna("-1").astype(int)
409
+ )
410
+
411
+ save_file_name = output_folder + original_file_name + "_j.csv"
412
+
413
+ out_reference_df = reference_df.merge(
414
+ original_data_df[join_columns], on="Response References", how="left"
415
+ )
416
+ out_reference_df.to_csv(save_file_name, index=None)
417
+
418
+ file_data_outputs = [save_file_name]
419
+
420
+ return out_reference_df, file_data_outputs
421
+
422
+
423
+ def get_basic_response_data(
424
+ file_data: pd.DataFrame, chosen_cols: List[str], verify_titles: bool = False
425
+ ) -> pd.DataFrame:
426
+
427
+ if not isinstance(chosen_cols, list):
428
+ chosen_cols = [chosen_cols]
429
+
430
+ if chosen_cols[0] not in file_data.columns:
431
+ error_msg = (
432
+ f"Column '{chosen_cols[0]}' not found in file_data columns. "
433
+ f"Available columns: {list(file_data.columns)}"
434
+ )
435
+ print(error_msg)
436
+ raise KeyError(error_msg)
437
+
438
+ # If verify_titles is True, we need to check and include the second column
439
+ if verify_titles is True:
440
+ if len(chosen_cols) < 2:
441
+ error_msg = (
442
+ "verify_titles is True but only one column provided. "
443
+ "Need at least 2 columns: one for response text and one for title."
444
+ )
445
+ print(error_msg)
446
+ raise ValueError(error_msg)
447
+ if chosen_cols[1] not in file_data.columns:
448
+ error_msg = (
449
+ f"Column '{chosen_cols[1]}' not found in file_data columns for title. "
450
+ f"Available columns: {list(file_data.columns)}"
451
+ )
452
+ print(error_msg)
453
+ raise KeyError(error_msg)
454
+ # Include both columns when verify_titles is True
455
+ basic_response_data = file_data[[chosen_cols[0], chosen_cols[1]]]
456
+ basic_response_data = basic_response_data.rename(
457
+ columns={
458
+ basic_response_data.columns[0]: "Response",
459
+ basic_response_data.columns[1]: "Title",
460
+ }
461
+ )
462
+ else:
463
+ basic_response_data = file_data[[chosen_cols[0]]]
464
+ basic_response_data = basic_response_data.rename(
465
+ columns={basic_response_data.columns[0]: "Response"}
466
+ )
467
+ basic_response_data = basic_response_data.reset_index(
468
+ names="Original Reference"
469
+ )
470
+ # Try to convert to int, if it fails, return a range of 1 to last row + 1
471
+ try:
472
+ basic_response_data["Original Reference"] = (
473
+ basic_response_data["Original Reference"].astype(int) + 1
474
+ )
475
+ except (ValueError, TypeError):
476
+ basic_response_data["Original Reference"] = range(
477
+ 1, len(basic_response_data) + 1
478
+ )
479
+
480
+ basic_response_data["Reference"] = basic_response_data.index.astype(int) + 1
481
+
482
+ if verify_titles is True:
483
+ basic_response_data["Title"] = basic_response_data["Title"].str.strip()
484
+ basic_response_data["Title"] = basic_response_data["Title"].apply(initial_clean)
485
+ else:
486
+ basic_response_data = basic_response_data[
487
+ ["Reference", "Response", "Original Reference"]
488
+ ]
489
+
490
+ basic_response_data["Response"] = basic_response_data["Response"].str.strip()
491
+ basic_response_data["Response"] = basic_response_data["Response"].apply(
492
+ initial_clean
493
+ )
494
+
495
+ return basic_response_data
496
+
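+ # Minimal usage sketch (the column name is an assumption, not from real data):
+ # get_basic_response_data(pd.DataFrame({"Response text": [" Yes. ", "No."]}), ["Response text"])
+ # returns the columns ["Reference", "Response", "Original Reference"], with
+ # references numbered from 1 and responses stripped and cleaned.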
497
+
498
+ def convert_reference_table_to_pivot_table(
499
+ df: pd.DataFrame, basic_response_data: pd.DataFrame = pd.DataFrame()
500
+ ):
501
+
502
+ df_in = df[["Response References", "General topic", "Subtopic", "Sentiment"]].copy()
503
+
504
+ df_in["Response References"] = df_in["Response References"].astype(int)
505
+
506
+ # Create a combined category column
507
+ df_in["Category"] = (
508
+ df_in["General topic"] + " - " + df_in["Subtopic"] + " - " + df_in["Sentiment"]
509
+ )
510
+
511
+ # Create pivot table counting occurrences of each unique combination
512
+ pivot_table = pd.crosstab(
513
+ index=df_in["Response References"],
514
+ columns=[df_in["General topic"], df_in["Subtopic"], df_in["Sentiment"]],
515
+ margins=True,
516
+ )
517
+
518
+ # Flatten column names to make them more readable
519
+ pivot_table.columns = [" - ".join(col) for col in pivot_table.columns]
520
+
521
+ pivot_table.reset_index(inplace=True)
522
+
523
+ if not basic_response_data.empty:
524
+ pivot_table = basic_response_data.merge(
525
+ pivot_table, right_on="Response References", left_on="Reference", how="left"
526
+ )
527
+
528
+ pivot_table.drop("Response References", axis=1, inplace=True)
529
+
530
+ pivot_table.columns = pivot_table.columns.str.replace(
531
+ "Not assessed - ", ""
532
+ ).str.replace("- Not assessed", "")
533
+
534
+ return pivot_table
535
+
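+ # Sketch of the output shape (values assumed): each flattened column is named
+ # "General topic - Subtopic - Sentiment", e.g. "Mental health - Anger - Negative",
+ # with one row per response reference and crosstab counts (plus "All" margins) as values.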
536
+
537
+ def create_topic_summary_df_from_reference_table(reference_df: pd.DataFrame):
538
+
539
+ if "Group" not in reference_df.columns:
540
+ reference_df["Group"] = "All"
541
+
542
+ # Ensure 'Start row of group' column is numeric to avoid comparison errors
543
+ if "Start row of group" in reference_df.columns:
544
+ reference_df["Start row of group"] = pd.to_numeric(
545
+ reference_df["Start row of group"], errors="coerce"
546
+ )
547
+
548
+ out_topic_summary_df = (
549
+ reference_df.groupby(["General topic", "Subtopic", "Sentiment", "Group"])
550
+ .agg(
551
+ {
552
+ "Response References": "size", # Count the number of references
553
+ "Summary": lambda x: "<br>".join(
554
+ sorted(
555
+ set(x),
556
+ key=lambda summary: reference_df.loc[
557
+ reference_df["Summary"] == summary, "Start row of group"
558
+ ].min(),
559
+ )
560
+ ),
561
+ }
562
+ )
563
+ .reset_index()
564
+ # .sort_values('Response References', ascending=False) # Sort by size, biggest first
565
+ )
566
+
567
+ out_topic_summary_df = out_topic_summary_df.rename(
568
+ columns={"Response References": "Number of responses"}, errors="ignore"
569
+ )
570
+
571
+ # Sort the dataframe first
572
+ out_topic_summary_df = out_topic_summary_df.sort_values(
573
+ ["Group", "Number of responses", "General topic", "Subtopic", "Sentiment"],
574
+ ascending=[True, False, True, True, True],
575
+ )
576
+
577
+ # Then assign Topic number based on the final sorted order
578
+ out_topic_summary_df = out_topic_summary_df.assign(
579
+ Topic_number=lambda df: np.arange(1, len(df) + 1)
580
+ )
581
+
582
+ out_topic_summary_df.rename(columns={"Topic_number": "Topic number"}, inplace=True)
583
+
584
+ return out_topic_summary_df
585
+
586
+
587
+ # Wrap text in each column to the specified max width, including whole words
588
+ def wrap_text(text: str, max_width=80, max_text_length=None):
589
+ if not isinstance(text, str):
590
+ return text
591
+
592
+ # If max_text_length is set, truncate the text and add ellipsis
593
+ if max_text_length and len(text) > max_text_length:
594
+ text = text[:max_text_length] + "..."
595
+
596
+ text = text.replace("\r\n", "<br>").replace("\n", "<br>")
597
+
598
+ words = text.split()
599
+ if not words:
600
+ return text
601
+
602
+ # First pass: initial word wrapping
603
+ wrapped_lines = list()
604
+ current_line = list()
605
+ current_length = 0
606
+
607
+ def add_line():
608
+ if current_line:
609
+ wrapped_lines.append(" ".join(current_line))
610
+ current_line.clear()
611
+
612
+ for i, word in enumerate(words):
613
+ word_length = len(word)
614
+
615
+ # Handle words longer than max_width
616
+ if word_length > max_width:
617
+ add_line()
618
+ wrapped_lines.append(word)
619
+ current_length = 0
620
+ continue
621
+
622
+ # Calculate space needed for this word
623
+ space_needed = word_length if not current_line else word_length + 1
624
+
625
+ # Check if adding this word would exceed max_width
626
+ if current_length + space_needed > max_width:
627
+ add_line()
628
+ current_line.append(word)
629
+ current_length = word_length
630
+ else:
631
+ current_line.append(word)
632
+ current_length += space_needed
633
+
634
+ add_line() # Add any remaining text
635
+
636
+ # Second pass: redistribute words from lines following single-word lines
637
+ def can_fit_in_previous_line(prev_line, word):
638
+ return len(prev_line) + 1 + len(word) <= max_width
639
+
640
+ i = 0
641
+ while i < len(wrapped_lines) - 1:
642
+ words_in_line = wrapped_lines[i].split()
643
+ next_line_words = wrapped_lines[i + 1].split()
644
+
645
+ # If current line has only one word and isn't too long
646
+ if len(words_in_line) == 1 and len(words_in_line[0]) < max_width * 0.8:
647
+ # Try to bring words back from the next line
648
+ words_to_bring_back = list()
649
+ remaining_words = list()
650
+ current_length = len(words_in_line[0])
651
+
652
+ for word in next_line_words:
653
+ if current_length + len(word) + 1 <= max_width:
654
+ words_to_bring_back.append(word)
655
+ current_length += len(word) + 1
656
+ else:
657
+ remaining_words.append(word)
658
+
659
+ if words_to_bring_back:
660
+ # Update current line with additional words
661
+ wrapped_lines[i] = " ".join(words_in_line + words_to_bring_back)
662
+
663
+ # Update next line with remaining words
664
+ if remaining_words:
665
+ wrapped_lines[i + 1] = " ".join(remaining_words)
666
+ else:
667
+ wrapped_lines.pop(i + 1)
668
+ continue # Don't increment i if we removed a line
669
+ i += 1
670
+
671
+ return "<br>".join(wrapped_lines)
672
+
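+ # Worked example of the first pass above (illustrative input):
+ # wrap_text("one two three", max_width=7) -> "one two<br>three", since "one two"
+ # exactly fills 7 characters and "three" no longer fits on that line. The second
+ # pass then tries to top up any single-word line with words from the line below.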
673
+
674
+ def initial_clean(text: str):
675
+ #### Some of my cleaning functions
676
+ html_pattern_regex = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;"
677
+ html_start_pattern_end_dots_regex = r"<(.*?)\.\."
678
+ non_ascii_pattern = r"[^\x00-\x7F]+"
679
+ multiple_spaces_regex = r"\s{2,}"
680
+
681
+ # Define a list of patterns and their replacements
682
+ patterns = [
683
+ (html_pattern_regex, " "),
684
+ (html_start_pattern_end_dots_regex, " "),
685
+ (non_ascii_pattern, " "),
686
+ (multiple_spaces_regex, " "),
687
+ ]
688
+
689
+ # Apply each regex replacement
690
+ for pattern, replacement in patterns:
691
+ text = re.sub(pattern, replacement, text)
692
+
693
+ return text
694
+
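+ # Illustrative example (assumed input): initial_clean("Caf\xe9 <b>opening</b>  times")
+ # -> "Caf opening times"; HTML tags and non-ASCII characters are replaced with
+ # spaces, then runs of spaces are collapsed to one.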
695
+
696
+ def view_table(file_path: str):
697
+ df = pd.read_csv(file_path)
698
+
699
+ df_cleaned = df.replace("\n", " ", regex=True)
700
+
701
+ # Use apply with axis=1 to apply wrap_text to each element
702
+ df_cleaned = df_cleaned.apply(lambda col: col.map(wrap_text))
703
+
704
+ table_out = df_cleaned.to_markdown(index=False)
705
+
706
+ return table_out
707
+
708
+
709
+ def ensure_output_folder_exists():
710
+ """Checks if the 'output/' folder exists, creates it if not."""
711
+
712
+ folder_name = "output/"
713
+
714
+ if not os.path.exists(folder_name):
715
+ # Create the folder if it doesn't exist
716
+ os.makedirs(folder_name)
717
+ print("Created the 'output/' folder.")
718
+ else:
719
+ print("The 'output/' folder already exists.")
720
+
721
+
722
+ def put_columns_in_df(in_file: List[str]):
723
+ new_choices = list()
724
+ concat_choices = list()
725
+ all_sheet_names = list()
726
+ number_of_excel_files = 0
727
+
728
+ if not in_file:
729
+ return (
730
+ gr.Dropdown(choices=list()),
731
+ gr.Dropdown(choices=list()),
732
+ "",
733
+ gr.Dropdown(choices=list()),
734
+ gr.Dropdown(choices=list()),
735
+ )
736
+
737
+ for file in in_file:
738
+ file_name = file.name
739
+ file_type = detect_file_type(file_name)
740
+ # print("File type is:", file_type)
741
+
742
+ file_end = get_file_path_with_extension(file_name)
743
+
744
+ if file_type == "xlsx":
745
+ number_of_excel_files += 1
746
+ new_choices = list()
747
+ print("Running through all xlsx sheets")
748
+ anon_xlsx = pd.ExcelFile(file_name)
749
+ new_sheet_names = anon_xlsx.sheet_names
750
+ # Iterate through the sheet names
751
+ for sheet_name in new_sheet_names:
752
+ # Read each sheet into a DataFrame
753
+ df = pd.read_excel(file_name, sheet_name=sheet_name)
754
+
755
+ new_choices.extend(list(df.columns))
756
+
757
+ all_sheet_names.extend(new_sheet_names)
758
+
759
+ else:
760
+ df = read_file(file_name)
761
+ new_choices = list(df.columns)
762
+
763
+ concat_choices.extend(new_choices)
764
+
765
+ # Drop duplicate columns
766
+ concat_choices = sorted(set(concat_choices))
767
+
768
+ if number_of_excel_files > 0:
769
+ return (
770
+ gr.Dropdown(choices=concat_choices, value=concat_choices[0]),
771
+ gr.Dropdown(
772
+ choices=all_sheet_names,
773
+ value=all_sheet_names[0],
774
+ visible=True,
775
+ interactive=True,
776
+ ),
777
+ file_end,
778
+ gr.Dropdown(choices=concat_choices),
779
+ gr.Dropdown(choices=concat_choices),
780
+ )
781
+ else:
782
+ return (
783
+ gr.Dropdown(choices=concat_choices, value=concat_choices[0]),
784
+ gr.Dropdown(visible=False),
785
+ file_end,
786
+ gr.Dropdown(choices=concat_choices),
787
+ gr.Dropdown(choices=concat_choices),
788
+ )
789
+
790
+
791
+ # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
792
+ def add_folder_to_path(folder_path: str):
793
+ """
794
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
795
+ """
796
+
797
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
798
+ print(folder_path, "folder exists.")
799
+
800
+ # Resolve relative path to absolute path
801
+ absolute_path = os.path.abspath(folder_path)
802
+
803
+ current_path = os.environ["PATH"]
804
+ if absolute_path not in current_path.split(os.pathsep):
805
+ full_path_extension = absolute_path + os.pathsep + current_path
806
+ os.environ["PATH"] = full_path_extension
807
+ # print(f"Updated PATH with: ", full_path_extension)
808
+ else:
809
+ print(f"Directory {folder_path} already exists in PATH.")
810
+ else:
811
+ print(f"Folder not found at {folder_path} - not added to PATH")
812
+
813
+
814
+ # Upon running a process, the feedback buttons are revealed
815
+ def reveal_feedback_buttons():
816
+ return (
817
+ gr.Radio(visible=True),
818
+ gr.Textbox(visible=True),
819
+ gr.Button(visible=True),
820
+ gr.Markdown(visible=True),
821
+ )
822
+
823
+
824
+ def wipe_logs(feedback_logs_loc: str, usage_logs_loc: str):
825
+ try:
826
+ os.remove(feedback_logs_loc)
827
+ except Exception as e:
828
+ print("Could not remove feedback logs file", e)
829
+ try:
830
+ os.remove(usage_logs_loc)
831
+ except Exception as e:
832
+ print("Could not remove usage logs file", e)
833
+
834
+
835
+ async def get_connection_params(
836
+ request: gr.Request,
837
+ output_folder_textbox: str = OUTPUT_FOLDER,
838
+ input_folder_textbox: str = INPUT_FOLDER,
839
+ session_output_folder: str = SESSION_OUTPUT_FOLDER,
840
+ ):
841
+
842
+ # print("Session hash:", request.session_hash)
843
+
844
+ if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
845
+ if CUSTOM_HEADER in request.headers:
846
+ supplied_custom_header_value = request.headers[CUSTOM_HEADER]
847
+ if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
848
+ print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
849
+ else:
850
+ print("Custom header value does not match expected value.")
851
+ raise ValueError("Custom header value does not match expected value.")
852
+ else:
853
+ print("Custom header value not found.")
854
+ raise ValueError("Custom header value not found.")
855
+
856
+ # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
857
+
858
+ if request.username:
859
+ out_session_hash = request.username
860
+ # print("Request username found:", out_session_hash)
861
+
862
+ elif "x-cognito-id" in request.headers:
863
+ out_session_hash = request.headers["x-cognito-id"]
864
+ # print("Cognito ID found:", out_session_hash)
865
+
866
+ elif "x-amzn-oidc-identity" in request.headers:
867
+ out_session_hash = request.headers["x-amzn-oidc-identity"]
868
+
869
+ # Fetch email address using Cognito client
870
+ cognito_client = boto3.client("cognito-idp")
871
+ try:
872
+ response = cognito_client.admin_get_user(
873
+ UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
874
+ Username=out_session_hash,
875
+ )
876
+ email = next(
877
+ attr["Value"]
878
+ for attr in response["UserAttributes"]
879
+ if attr["Name"] == "email"
880
+ )
881
+ # print("Email address found:", email)
882
+
883
+ out_session_hash = email
884
+ except ClientError as e:
885
+ print("Error fetching user details:", e)
886
+ email = None
887
+
888
+ print("Cognito ID found:", out_session_hash)
889
+
890
+ else:
891
+ out_session_hash = request.session_hash
892
+
893
+ if session_output_folder == "True" or session_output_folder is True:
894
+ output_folder = output_folder_textbox + out_session_hash + "/"
895
+ input_folder = input_folder_textbox + out_session_hash + "/"
896
+ else:
897
+ output_folder = output_folder_textbox
898
+ input_folder = input_folder_textbox
899
+
900
+ os.makedirs(output_folder, exist_ok=True)
+ os.makedirs(input_folder, exist_ok=True)
904
+
905
+ return out_session_hash, output_folder, out_session_hash, input_folder
906
+
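+ # Note on the lookup order above: a username from direct Cognito login wins, then
+ # the "x-cognito-id" header, then "x-amzn-oidc-identity" (resolved to an email
+ # address where possible), and finally Gradio's anonymous session hash.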
907
+
908
+ def load_in_default_cost_codes(cost_codes_path: str, default_cost_code: str = ""):
909
+ """
910
+ Load in the cost codes list from file.
911
+ """
912
+ cost_codes_df = pd.read_csv(cost_codes_path)
913
+ dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
914
+
915
+ # Avoid inserting duplicate or empty cost code values
916
+ if default_cost_code and default_cost_code not in dropdown_choices:
917
+ dropdown_choices.insert(0, default_cost_code)
918
+
919
+ # Always have a blank option at the top
920
+ if "" not in dropdown_choices:
921
+ dropdown_choices.insert(0, "")
922
+
923
+ out_dropdown = gr.Dropdown(
924
+ value=default_cost_code if default_cost_code in dropdown_choices else "",
925
+ label="Choose cost code for analysis",
926
+ choices=dropdown_choices,
927
+ allow_custom_value=False,
928
+ )
929
+
930
+ return cost_codes_df, cost_codes_df, out_dropdown
931
+
932
+
933
+ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
934
+ row_value_code = evt.row_value[0] # This is the value for cost code
935
+
936
+ return row_value_code
937
+
938
+
939
+ def update_cost_code_dataframe_from_dropdown_select(
940
+ cost_dropdown_selection: str, cost_code_df: pd.DataFrame
941
+ ):
942
+ cost_code_df = cost_code_df.loc[
943
+ cost_code_df.iloc[:, 0] == cost_dropdown_selection, :
944
+ ]
945
+ return cost_code_df
946
+
947
+
948
+ def reset_base_dataframe(df: pd.DataFrame):
949
+ return df
950
+
951
+
952
+ def enforce_cost_codes(
953
+ enforce_cost_code_textbox: str,
954
+ cost_code_choice: str,
955
+ cost_code_df: pd.DataFrame,
956
+ verify_cost_codes: bool = True,
957
+ ):
958
+ """
959
+ Check if the enforce cost codes variable is set to true, and then check that a cost code has been chosen. If not, raise an error. Then check the chosen value against the values in the cost code dataframe to ensure that the cost code exists.
960
+ """
961
+
962
+ if enforce_cost_code_textbox == "True":
963
+ if not cost_code_choice:
964
+ raise Exception("Please choose a cost code before continuing")
965
+
966
+ if verify_cost_codes is True:
967
+ if cost_code_df.empty:
968
+ # Warn but don't block - cost code is still required above
969
+ print(
970
+ "Warning: Cost code dataframe is empty. Verification skipped. Please ensure cost codes are loaded for full validation."
971
+ )
972
+ else:
973
+ valid_cost_codes_list = list(cost_code_df.iloc[:, 0].unique())
974
+
975
+ if cost_code_choice not in valid_cost_codes_list:
976
+ raise Exception(
977
+ "Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions."
978
+ )
979
+ return
980
+
981
+
982
+ def _get_env_list(env_var_name: str, strip_strings: bool = True) -> List[str]:
983
+ """Parses a comma-separated environment variable into a list of strings."""
984
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
985
+ if not value:
986
+ return []
987
+ # Split by comma and filter out any empty strings that might result from extra commas
988
+ if strip_strings:
989
+ return [s.strip() for s in value.split(",") if s.strip()]
990
+ else:
991
+ return [codecs.decode(s, "unicode_escape") for s in value.split(",") if s]
992
+
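+ # Illustrative example (assumed value): _get_env_list("['a', 'b']") -> ["a", "b"];
+ # the leading "[" and trailing "]" are sliced off before splitting on commas.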
993
+
994
+ def create_batch_file_path_details(
995
+ reference_data_file_name: str,
996
+ latest_batch_completed: int = None,
997
+ batch_size_number: int = None,
998
+ in_column: str = None,
999
+ ) -> str:
1000
+ """
1001
+ Creates a standardised batch file path detail string from a reference data filename.
1002
+
1003
+ Args:
1004
+ reference_data_file_name (str): Name of the reference data file
1005
+ latest_batch_completed (int, optional): Latest batch completed. Defaults to None.
1006
+ batch_size_number (int, optional): Batch size number. Defaults to None.
1007
+ in_column (str, optional): In column. Defaults to None.
1008
+ Returns:
1009
+ str: Formatted batch file path detail string
1010
+ """
1011
+
1012
+ # Extract components from filename using regex
1013
+ file_name = (
1014
+ re.search(
1015
+ r"(.*?)(?:_all_|_final_|_batch_|_col_)", reference_data_file_name
1016
+ ).group(1)
1017
+ if re.search(r"(.*?)(?:_all_|_final_|_batch_|_col_)", reference_data_file_name)
1018
+ else reference_data_file_name
1019
+ )
1020
+ latest_batch_completed = (
1021
+ int(re.search(r"batch_(\d+)_", reference_data_file_name).group(1))
1022
+ if "batch_" in reference_data_file_name
1023
+ else latest_batch_completed
1024
+ )
1025
+ batch_size_number = (
1026
+ int(re.search(r"size_(\d+)_", reference_data_file_name).group(1))
1027
+ if "size_" in reference_data_file_name
1028
+ else batch_size_number
1029
+ )
1030
+ in_column = (
1031
+ re.search(r"col_(.*?)_reference", reference_data_file_name).group(1)
1032
+ if "col_" in reference_data_file_name
1033
+ else in_column
1034
+ )
1035
+
1036
+ # Clean the extracted names
1037
+ file_name_cleaned = clean_column_name(file_name, max_length=20)
1038
+ in_column_cleaned = clean_column_name(in_column, max_length=20)
1039
+
1040
+ # Create batch file path details string
1041
+ if latest_batch_completed:
1042
+ return f"{file_name_cleaned}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
1043
+ return f"{file_name_cleaned}_col_{in_column_cleaned}"
1044
+
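+ # Worked example (the file name is an assumption): for
+ # "responses_batch_2_size_5_col_Response_reference_table.csv" the regexes above
+ # recover file_name="responses", batch 2, size 5 and column "Response", producing
+ # "responses_batch_2_size_5_col_Response".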
1045
+
1046
+ def move_overall_summary_output_files_to_front_page(
1047
+ overall_summary_output_files_xlsx: List[str],
1048
+ ):
1049
+ return overall_summary_output_files_xlsx
1050
+
1051
+
1052
+ def generate_zero_shot_topics_df(
1053
+ zero_shot_topics: pd.DataFrame,
1054
+ force_zero_shot_radio: str = "No",
1055
+ create_revised_general_topics: bool = False,
1056
+ max_topic_no: int = MAXIMUM_ZERO_SHOT_TOPICS,
1057
+ ):
1058
+ """
1059
+ Preprocesses a DataFrame of zero-shot topics, cleaning and formatting them
1060
+ for use with a large language model. It handles different column configurations
1061
+ (e.g., only subtopics, general topics and subtopics, or subtopics with descriptions)
1062
+ and enforces a maximum number of topics.
1063
+
1064
+ Args:
1065
+ zero_shot_topics (pd.DataFrame): A DataFrame containing the initial zero-shot topics.
1066
+ Expected columns can vary, but typically include
1067
+ "General topic", "Subtopic", and/or "Description".
1068
+ force_zero_shot_radio (str, optional): A string indicating whether to force
1069
+ the use of zero-shot topics. Defaults to "No".
1070
+ (Currently not used in the function logic, but kept for signature consistency).
1071
+ create_revised_general_topics (bool, optional): A boolean indicating whether to
1072
+ create revised general topics. Defaults to False.
1073
+ (Currently not used in the function logic, but kept for signature consistency).
1074
+ max_topic_no (int, optional): The maximum number of topics allowed to fit within
1075
+ LLM context limits. If `zero_shot_topics` exceeds this,
1076
+ an exception is raised. Defaults to MAXIMUM_ZERO_SHOT_TOPICS.
1077
+
1078
+ Returns:
1079
+ pd.DataFrame: A DataFrame with "General topic", "Subtopic" and "Description"
+ columns, deduplicated on General topic/Subtopic and sorted by both.
1083
+ """
1084
+
1085
+ zero_shot_topics_gen_topics_list = list()
1086
+ zero_shot_topics_subtopics_list = list()
1087
+ zero_shot_topics_description_list = list()
1088
+
1089
+ # Enforce the configured maximum number of zero-shot topics
1090
+ if zero_shot_topics.shape[0] > max_topic_no:
1091
+ out_message = (
1092
+ "Maximum "
1093
+ + str(max_topic_no)
1094
+ + " zero-shot topics allowed according to application configuration."
1095
+ )
1096
+ print(out_message)
1097
+ raise Exception(out_message)
1098
+
1099
+ # Forward slashes in the topic names seems to confuse the model
1100
+ if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1101
+ for x in zero_shot_topics.columns:
1102
+ if not zero_shot_topics[x].isnull().all():
1103
+ zero_shot_topics[x] = zero_shot_topics[x].apply(initial_clean)
1104
+
1105
+ zero_shot_topics.loc[:, x] = (
1106
+ zero_shot_topics.loc[:, x]
1107
+ .str.strip()
1108
+ .str.replace("\n", " ")
1109
+ .str.replace("\r", " ")
1110
+ .str.replace("/", " or ")
1111
+ .str.replace("&", " and ")
1112
+ .str.replace(" s ", "s ")
1113
+ .str.lower()
1114
+ .str.capitalize()
1115
+ )
1116
+
1117
+ # If number of columns is 1, keep only subtopics
1118
+ if (
1119
+ zero_shot_topics.shape[1] == 1
1120
+ and "General topic" not in zero_shot_topics.columns
1121
+ ):
1122
+ print("Found only Subtopic in zero shot topics")
1123
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1124
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1125
+ # Allow for possibility that the user only wants to set general topics and not subtopics
1126
+ elif (
1127
+ zero_shot_topics.shape[1] == 1
1128
+ and "General topic" in zero_shot_topics.columns
1129
+ ):
1130
+ print("Found only General topic in zero shot topics")
1131
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General topic"])
1132
+ zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1133
+ # If general topic and subtopic are specified
1134
+ elif set(["General topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1135
+ print("Found General topic and Subtopic in zero shot topics")
1136
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General topic"])
1137
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1138
+ # If subtopic and description are specified
1139
+ elif set(["Subtopic", "Description"]).issubset(zero_shot_topics.columns):
1140
+ print("Found Subtopic and Description in zero shot topics")
1141
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1142
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1143
+ zero_shot_topics_description_list = list(zero_shot_topics["Description"])
1144
+
1145
+ # If number of columns is at least 2, keep general topics and subtopics
1146
+ elif (
1147
+ zero_shot_topics.shape[1] >= 2
1148
+ and "Description" not in zero_shot_topics.columns
1149
+ ):
1150
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1151
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1152
+ else:
1153
+ # If there are more columns, just assume that the first column was meant to be a subtopic
1154
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1155
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1156
+
1157
+ # Add a description if column is present
1158
+ if not zero_shot_topics_description_list:
1159
+ if "Description" in zero_shot_topics.columns:
1160
+ zero_shot_topics_description_list = list(
1161
+ zero_shot_topics["Description"]
1162
+ )
1163
+ # print("Description found in topic title. List is:", zero_shot_topics_description_list)
1164
+ elif zero_shot_topics.shape[1] >= 3:
1165
+ zero_shot_topics_description_list = list(
1166
+ zero_shot_topics.iloc[:, 2]
1167
+ ) # Assume the third column is description
1168
+ else:
1169
+ zero_shot_topics_description_list = [""] * zero_shot_topics.shape[0]
1170
+
1171
+ # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1172
+ if force_zero_shot_radio == "Yes":
1173
+ zero_shot_topics_gen_topics_list.append("")
1174
+ zero_shot_topics_subtopics_list.append("No relevant topic")
1175
+ zero_shot_topics_description_list.append("")
1176
+
1177
+ # Add description or not
1178
+ zero_shot_topics_df = pd.DataFrame(
1179
+ data={
1180
+ "General topic": zero_shot_topics_gen_topics_list,
1181
+ "Subtopic": zero_shot_topics_subtopics_list,
1182
+ "Description": zero_shot_topics_description_list,
1183
+ }
1184
+ )
1185
+
1186
+ # Filter out duplicate General topic and subtopic names
1187
+ zero_shot_topics_df = zero_shot_topics_df.drop_duplicates(
1188
+ ["General topic", "Subtopic"], keep="first"
1189
+ )
1190
+
1191
+ # Sort the dataframe by General topic and subtopic
1192
+ zero_shot_topics_df = zero_shot_topics_df.sort_values(
1193
+ ["General topic", "Subtopic"], ascending=[True, True]
1194
+ )
1195
+
1196
+ return zero_shot_topics_df
1197
+
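+ # Minimal usage sketch (topic names assumed): a one-column frame such as
+ # pd.DataFrame({"Subtopic": ["Housing", "Transport"]}) comes back as a DataFrame
+ # with blank "General topic" and "Description" columns alongside the cleaned subtopics.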
1198
+
1199
+ def update_model_choice(model_source):
+     # Filter models by source and return the first matching model name
+     matching_models = [
+         model_name
+         for model_name, model_info in model_name_map.items()
+         if model_info["source"] == model_source
+     ]
+
+     output_model = matching_models[0] if matching_models else model_full_names[0]
+
+     return gr.Dropdown(
+         value=output_model,
+         choices=matching_models,
+         label="Large language model for topic extraction and summarisation",
+         multiselect=False,
+     )
+
+
+ def ensure_model_in_map(model_choice: str, model_name_map_dict: dict = None) -> dict:
+     """
+     Ensures that a model_choice is registered in model_name_map.
+     If the model_choice is not found, it assumes it's an inference-server model
+     and adds it to the map with source "inference-server".
+
+     Args:
+         model_choice (str): The model name to check/register
+         model_name_map_dict (dict, optional): The model_name_map dictionary to update.
+             If None, uses the global model_name_map from config.
+
+     Returns:
+         dict: The model_name_map dictionary (updated if needed)
+     """
+     # Use provided dict or global one
+     if model_name_map_dict is None:
+         from tools.config import model_name_map
+
+         model_name_map_dict = model_name_map
+
+     # If model_choice is not in the map, assume it's an inference-server model
+     if model_choice not in model_name_map_dict:
+         model_name_map_dict[model_choice] = {
+             "short_name": model_choice,
+             "source": "inference-server",
+         }
+         print(f"Registered custom model '{model_choice}' as inference-server model")
+
+     return model_name_map_dict
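A minimal usage sketch for the helper above ("my-served-model" is a hypothetical name):

    model_name_map = ensure_model_in_map("my-served-model")
    # Unrecognised names are registered on the fly:
    # model_name_map["my-served-model"] == {"short_name": "my-served-model", "source": "inference-server"}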
tools/llm_api_call.py ADDED
The diff for this file is too large to render. See raw diff
 
tools/llm_funcs.py ADDED
@@ -0,0 +1,1999 @@
+ import json
+ import os
+ import re
+ import time
+ from typing import List, Tuple
+
+ import boto3
+ import pandas as pd
+ import requests
+
+ # Import mock patches if in test mode
+ if os.environ.get("USE_MOCK_LLM") == "1" or os.environ.get("TEST_MODE") == "1":
+     try:
+         # Try to import and apply mock patches
+         import sys
+
+         # Add project root to sys.path so we can import test.mock_llm_calls
+         project_root = os.path.dirname(os.path.dirname(__file__))
+         if project_root not in sys.path:
+             sys.path.insert(0, project_root)
+         try:
+             from test.mock_llm_calls import apply_mock_patches
+
+             apply_mock_patches()
+         except ImportError:
+             # If mock module not found, continue without mocking
+             pass
+     except Exception:
+         # If anything fails, continue without mocking
+         pass
+ from google import genai as ai
+ from google.genai import types
+ from gradio import Progress
+ from huggingface_hub import hf_hub_download
+ from openai import OpenAI
+ from tqdm import tqdm
+
+ model_type = None  # global variable setup
+ full_text = (
+     ""  # Define dummy source text (full text) just to enable highlight function to load
+ )
+
+ # Global variables for model and tokenizer
+ _model = None
+ _tokenizer = None
+ _assistant_model = None
+
+ from tools.config import (
+     ASSISTANT_MODEL,
+     BATCH_SIZE_DEFAULT,
+     CHOSEN_LOCAL_MODEL_TYPE,
+     COMPILE_MODE,
+     COMPILE_TRANSFORMERS,
+     DEDUPLICATION_THRESHOLD,
+     HF_TOKEN,
+     INT8_WITH_OFFLOAD_TO_CPU,
+     K_QUANT_LEVEL,
+     LLM_BATCH_SIZE,
+     LLM_CONTEXT_LENGTH,
+     LLM_LAST_N_TOKENS,
+     LLM_MAX_GPU_LAYERS,
+     LLM_MAX_NEW_TOKENS,
+     LLM_MIN_P,
+     LLM_REPETITION_PENALTY,
+     LLM_RESET,
+     LLM_SAMPLE,
+     LLM_SEED,
+     LLM_STOP_STRINGS,
+     LLM_STREAM,
+     LLM_TEMPERATURE,
+     LLM_THREADS,
+     LLM_TOP_K,
+     LLM_TOP_P,
+     LOAD_LOCAL_MODEL_AT_START,
+     LOCAL_MODEL_FILE,
+     LOCAL_MODEL_FOLDER,
+     LOCAL_REPO_ID,
+     MAX_COMMENT_CHARS,
+     MAX_TIME_FOR_LOOP,
+     MODEL_DTYPE,
+     MULTIMODAL_PROMPT_FORMAT,
+     NUM_PRED_TOKENS,
+     NUMBER_OF_RETRY_ATTEMPTS,
+     RUN_LOCAL_MODEL,
+     SPECULATIVE_DECODING,
+     TIMEOUT_WAIT,
+     USE_BITSANDBYTES,
+     USE_LLAMA_CPP,
+     USE_LLAMA_SWAP,
+     V_QUANT_LEVEL,
+ )
+ from tools.helper_functions import _get_env_list
+
+ if SPECULATIVE_DECODING == "True":
+     SPECULATIVE_DECODING = True
+ else:
+     SPECULATIVE_DECODING = False
+
+
+ if isinstance(NUM_PRED_TOKENS, str):
+     NUM_PRED_TOKENS = int(NUM_PRED_TOKENS)
+ if isinstance(LLM_MAX_GPU_LAYERS, str):
+     LLM_MAX_GPU_LAYERS = int(LLM_MAX_GPU_LAYERS)
+ if isinstance(LLM_THREADS, str):
+     LLM_THREADS = int(LLM_THREADS)
+
+ if LLM_RESET == "True":
+     reset = True
+ else:
+     reset = False
+
+ if LLM_STREAM == "True":
+     stream = True
+ else:
+     stream = False
+
+ if LLM_SAMPLE == "True":
+     sample = True
+ else:
+     sample = False
+
+ if LLM_STOP_STRINGS:
+     LLM_STOP_STRINGS = _get_env_list(LLM_STOP_STRINGS, strip_strings=False)
+
+ max_tokens = LLM_MAX_NEW_TOKENS
+ timeout_wait = TIMEOUT_WAIT
+ number_of_api_retry_attempts = NUMBER_OF_RETRY_ATTEMPTS
+ max_time_for_loop = MAX_TIME_FOR_LOOP
+ batch_size_default = BATCH_SIZE_DEFAULT
+ deduplication_threshold = DEDUPLICATION_THRESHOLD
+ max_comment_character_length = MAX_COMMENT_CHARS
+
+ temperature = LLM_TEMPERATURE
+ top_k = LLM_TOP_K
+ top_p = LLM_TOP_P
+ min_p = LLM_MIN_P
+ repetition_penalty = LLM_REPETITION_PENALTY
+ last_n_tokens = LLM_LAST_N_TOKENS
+ LLM_MAX_NEW_TOKENS: int = LLM_MAX_NEW_TOKENS
+ seed: int = LLM_SEED
+ reset: bool = reset
+ stream: bool = stream
+ batch_size: int = LLM_BATCH_SIZE
+ context_length: int = LLM_CONTEXT_LENGTH
+ sample: bool = sample  # keep the boolean parsed from LLM_SAMPLE above
+ stop_strings = LLM_STOP_STRINGS
+ speculative_decoding = SPECULATIVE_DECODING
+ if LLM_MAX_GPU_LAYERS != 0:
+     gpu_layers = int(LLM_MAX_GPU_LAYERS)
+     torch_device = "cuda"
+ else:
+     gpu_layers = 0
+     torch_device = "cpu"
+
+ if not LLM_THREADS:
+     threads = 1
+ else:
+     threads = LLM_THREADS
+
+
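A short sketch (illustrative, not from the module) of the coercion idiom used above: config values arrive as strings from the environment, so each is normalised to a typed Python value exactly once at import time.

    # Hypothetical helpers showing the same normalisation pattern
    def _as_bool(value) -> bool:
        return value == "True"

    def _as_int(value) -> int:
        return int(value) if isinstance(value, str) else value

    # e.g. stream = _as_bool(LLM_STREAM); threads = _as_int(LLM_THREADS) or 1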
+ class llama_cpp_init_config_gpu:
+     def __init__(
+         self,
+         last_n_tokens=last_n_tokens,
+         seed=seed,
+         n_threads=threads,
+         n_batch=batch_size,
+         n_ctx=context_length,
+         n_gpu_layers=gpu_layers,
+         reset=reset,
+     ):
+
+         self.last_n_tokens = last_n_tokens
+         self.seed = seed
+         self.n_threads = n_threads
+         self.n_batch = n_batch
+         self.n_ctx = n_ctx
+         self.n_gpu_layers = n_gpu_layers
+         self.reset = reset
+         # self.stop: list[str] = field(default_factory=lambda: [stop_string])
+
+     def update_gpu(self, new_value):
+         self.n_gpu_layers = new_value
+
+     def update_context(self, new_value):
+         self.n_ctx = new_value
+
+
+ class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
+     def __init__(self):
+         super().__init__()
+         self.n_gpu_layers = gpu_layers
+         self.n_ctx = context_length
+
+
+ gpu_config = llama_cpp_init_config_gpu()
+ cpu_config = llama_cpp_init_config_cpu()
+
+
+ class LlamaCPPGenerationConfig:
+     def __init__(
+         self,
+         temperature=temperature,
+         top_k=top_k,
+         min_p=min_p,
+         top_p=top_p,
+         repeat_penalty=repetition_penalty,
+         seed=seed,
+         stream=stream,
+         max_tokens=LLM_MAX_NEW_TOKENS,
+         reset=reset,
+     ):
+         self.temperature = temperature
+         self.top_k = top_k
+         self.top_p = top_p
+         self.min_p = min_p  # store min_p so the accepted parameter is not silently dropped
+         self.repeat_penalty = repeat_penalty
+         self.seed = seed
+         self.max_tokens = max_tokens
+         self.stream = stream
+         self.reset = reset
+
+     def update_temp(self, new_value):
+         self.temperature = new_value
+
+
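A minimal usage sketch for the generation config (values illustrative):

    gen_config = LlamaCPPGenerationConfig()  # defaults come from tools.config
    gen_config.update_temp(0.2)              # override temperature per call
    # vars(gpu_config) is later unpacked into the Llama() constructor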
+ # ResponseObject class for AWS Bedrock calls
+ class ResponseObject:
+     def __init__(self, text, usage_metadata):
+         self.text = text
+         self.usage_metadata = usage_metadata
+
+
+ ###
+ # LOCAL MODEL FUNCTIONS
+ ###
+
+
+ def get_model_path(
+     repo_id=LOCAL_REPO_ID,
+     model_filename=LOCAL_MODEL_FILE,
+     model_dir=LOCAL_MODEL_FOLDER,
+     hf_token=HF_TOKEN,
+ ):
+     # Construct the expected local path
+     local_path = os.path.join(model_dir, model_filename)
+
+     print("local path for model load:", local_path)
+
+     try:
+         if os.path.exists(local_path):
+             print(f"Model already exists at: {local_path}")
+
+             return local_path
+         else:
+             if hf_token:
+                 print("Downloading model from Hugging Face Hub with HF token")
+                 downloaded_model_path = hf_hub_download(
+                     repo_id=repo_id, token=hf_token, filename=model_filename
+                 )
+
+                 return downloaded_model_path
+             else:
+                 print(
+                     "No HF token found, downloading model from Hugging Face Hub without token"
+                 )
+                 downloaded_model_path = hf_hub_download(
+                     repo_id=repo_id, filename=model_filename
+                 )
+
+                 return downloaded_model_path
+
+     except Exception as e:
+         print("Error loading model:", e)
+         raise Warning("Error loading model:", e)
+
+
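A hedged sketch of the resolution order above (paths are hypothetical): a local file wins; otherwise the file is fetched from the Hub.

    # model_dir="models", model_filename="model.gguf"
    # 1. "models/model.gguf" exists -> returned directly
    # 2. otherwise                  -> hf_hub_download(repo_id=..., filename="model.gguf")
    path = get_model_path(model_dir="models", model_filename="model.gguf")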
+ def load_model(
+     local_model_type: str = CHOSEN_LOCAL_MODEL_TYPE,
+     gpu_layers: int = gpu_layers,
+     max_context_length: int = context_length,
+     gpu_config: llama_cpp_init_config_gpu = gpu_config,
+     cpu_config: llama_cpp_init_config_cpu = cpu_config,
+     torch_device: str = torch_device,
+     repo_id=LOCAL_REPO_ID,
+     model_filename=LOCAL_MODEL_FILE,
+     model_dir=LOCAL_MODEL_FOLDER,
+     compile_mode=COMPILE_MODE,
+     model_dtype=MODEL_DTYPE,
+     hf_token=HF_TOKEN,
+     speculative_decoding=speculative_decoding,
+     model=None,
+     tokenizer=None,
+     assistant_model=None,
+ ):
+     """
+     Load in a model from Hugging Face Hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Hugging Face Hub.
+
+     Args:
+         local_model_type (str): The type of local model to load (e.g., "llama-cpp").
+         gpu_layers (int): The number of GPU layers to offload to the GPU.
+         max_context_length (int): The maximum context length for the model.
+         gpu_config (llama_cpp_init_config_gpu): Configuration object for GPU-specific Llama.cpp parameters.
+         cpu_config (llama_cpp_init_config_cpu): Configuration object for CPU-specific Llama.cpp parameters.
+         torch_device (str): The device to load the model on ("cuda" for GPU, "cpu" for CPU).
+         repo_id (str): The Hugging Face repository ID where the model is located.
+         model_filename (str): The specific filename of the model to download from the repository.
+         model_dir (str): The local directory where the model will be stored or downloaded.
+         compile_mode (str): The compilation mode to use for the model.
+         model_dtype (str): The data type to use for the model.
+         hf_token (str): The Hugging Face token to use for the model.
+         speculative_decoding (bool): Whether to use speculative decoding.
+         model (Llama/transformers model): The model to load.
+         tokenizer (list/transformers tokenizer): The tokenizer to load.
+         assistant_model (transformers model): The assistant model for speculative decoding.
+     Returns:
+         tuple: A tuple containing:
+             - model (Llama/transformers model): The loaded Llama.cpp/transformers model instance.
+             - tokenizer (list/transformers tokenizer): An empty list (tokenizer is not used with Llama.cpp directly in this setup), or a transformers tokenizer.
+             - assistant_model (transformers model): The assistant model for speculative decoding (if speculative_decoding is True).
+     """
+
+     if model:
+         return model, tokenizer, assistant_model
+
+     print("Loading model:", local_model_type)
+
+     # Verify the device and CUDA settings
+     # Check if CUDA is enabled
+
+     import torch
+
+     torch.cuda.empty_cache()
+     print("Is CUDA enabled? ", torch.cuda.is_available())
+     print("Is a CUDA device available on this computer?", torch.backends.cudnn.enabled)
+     if torch.cuda.is_available():
+         torch_device = "cuda"
+         gpu_layers = int(LLM_MAX_GPU_LAYERS)
+         print("CUDA version:", torch.version.cuda)
+         # try:
+         #     os.system("nvidia-smi")
+         # except Exception as e:
+         #     print("Could not print nvidia-smi settings due to:", e)
+     else:
+         torch_device = "cpu"
+         gpu_layers = 0
+
+     print("Running on device:", torch_device)
+     print("GPU layers assigned to cuda:", gpu_layers)
+
+     if not LLM_THREADS:
+         threads = torch.get_num_threads()
+     else:
+         threads = LLM_THREADS
+     print("CPU threads:", threads)
+
+     # GPU mode
+     if torch_device == "cuda":
+         torch.cuda.empty_cache()
+         gpu_config.update_gpu(gpu_layers)
+         gpu_config.update_context(max_context_length)
+
+         if USE_LLAMA_CPP == "True":
+             from llama_cpp import Llama
+             from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+
+             model_path = get_model_path(
+                 repo_id=repo_id, model_filename=model_filename, model_dir=model_dir
+             )
+
+             try:
+                 print("GPU load variables:", vars(gpu_config))
+                 if speculative_decoding:
+                     model = Llama(
+                         model_path=model_path,
+                         type_k=K_QUANT_LEVEL,
+                         type_v=V_QUANT_LEVEL,
+                         flash_attn=True,
+                         draft_model=LlamaPromptLookupDecoding(
+                             num_pred_tokens=NUM_PRED_TOKENS
+                         ),
+                         **vars(gpu_config),
+                     )
+                 else:
+                     model = Llama(
+                         model_path=model_path,
+                         type_k=K_QUANT_LEVEL,
+                         type_v=V_QUANT_LEVEL,
+                         flash_attn=True,
+                         **vars(gpu_config),
+                     )
+
+             except Exception as e:
+                 print("GPU load failed due to:", e, "Loading model in CPU mode")
+                 # If fails, go to CPU mode
+                 model = Llama(model_path=model_path, **vars(cpu_config))
+
+         else:
+             from transformers import (
+                 AutoModelForCausalLM,
+                 BitsAndBytesConfig,
+             )
+             from unsloth import FastLanguageModel
+
+             print("Loading model from transformers")
+             # Use the official model ID for Gemma 3 4B
+             model_id = (
+                 repo_id.split("https://huggingface.co/")[-1]
+                 if "https://huggingface.co/" in repo_id
+                 else repo_id
+             )
+             # 1. Set Data Type (dtype)
+             # For H200/Hopper: 'bfloat16'
+             # For RTX 3060/Ampere: 'float16'
+             dtype_str = model_dtype  # os.environ.get("MODEL_DTYPE", "bfloat16").lower()
+             if dtype_str == "bfloat16":
+                 torch_dtype = torch.bfloat16
+             elif dtype_str == "float16":
+                 torch_dtype = torch.float16
+             else:
+                 torch_dtype = torch.float32  # A safe fallback
+
+             # 2. Set Compilation Mode
+             # 'max-autotune' is great for both but can be slow initially.
+             # 'reduce-overhead' is a faster alternative for compiling.
+
+             print("--- System Configuration ---")
+             print(f"Using model id: {model_id}")
+             print(f"Using dtype: {torch_dtype}")
+             print(f"Using compile mode: {compile_mode}")
+             print(f"Using bitsandbytes: {USE_BITSANDBYTES}")
+             print("--------------------------\n")
+
+             # --- Load Tokenizer and Model ---
+
+             try:
+
+                 # Load Tokenizer and Model
+                 # tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+                 if USE_BITSANDBYTES == "True":
+
+                     if INT8_WITH_OFFLOAD_TO_CPU == "True":
+                         # This will be very slow. Requires at least 4GB of VRAM and 32GB of RAM
+                         print(
+                             "Using bitsandbytes for quantisation to 8 bits, with offloading to CPU"
+                         )
+                         max_memory = {0: "4GB", "cpu": "32GB"}
+                         quantisation_config = BitsAndBytesConfig(
+                             load_in_8bit=True,
+                             max_memory=max_memory,
+                             llm_int8_enable_fp32_cpu_offload=True,  # Note: if bitsandbytes has to offload to CPU, inference will be slow
+                         )
+                     else:
+                         # For Gemma 4B, requires at least 6GB of VRAM
+                         print("Using bitsandbytes for quantisation to 4 bits")
+                         quantisation_config = BitsAndBytesConfig(
+                             load_in_4bit=True,
+                             bnb_4bit_quant_type="nf4",  # Use the modern NF4 quantisation for better performance
+                             bnb_4bit_compute_dtype=torch_dtype,
+                             bnb_4bit_use_double_quant=True,  # Optional: uses a second quantisation step to save even more memory
+                         )
+
+                     # print("Loading model with bitsandbytes quantisation config:", quantisation_config)
+
+                     model, tokenizer = FastLanguageModel.from_pretrained(
+                         model_id,
+                         max_seq_length=max_context_length,
+                         dtype=torch_dtype,
+                         device_map="auto",
+                         load_in_4bit=True,
+                         # quantization_config=quantisation_config,  # Not actually used in Unsloth
+                         token=hf_token,
+                     )
+
+                     FastLanguageModel.for_inference(model)
+                 else:
+                     print("Loading model without bitsandbytes quantisation")
+                     model, tokenizer = FastLanguageModel.from_pretrained(
+                         model_id,
+                         max_seq_length=max_context_length,
+                         dtype=torch_dtype,
+                         device_map="auto",
+                         token=hf_token,
+                     )
+
+                     FastLanguageModel.for_inference(model)
+
+                 if not tokenizer.pad_token:
+                     tokenizer.pad_token = tokenizer.eos_token
+
+             except Exception as e:
+                 print("Error loading model with bitsandbytes quantisation config:", e)
+                 raise Warning(
+                     "Error loading model with bitsandbytes quantisation config:", e
+                 )
+
+             # Compile the Model with the selected mode πŸš€
+             if COMPILE_TRANSFORMERS == "True":
+                 try:
+                     model = torch.compile(model, mode=compile_mode, fullgraph=True)
+                 except Exception as e:
+                     print(f"Could not compile model: {e}. Running in eager mode.")
+
+         print(
+             "Loading with",
+             gpu_config.n_gpu_layers,
+             "model layers sent to GPU and a maximum context length of",
+             gpu_config.n_ctx,
+         )
+
+     # CPU mode
+     else:
+         if USE_LLAMA_CPP == "False":
+             raise Warning(
+                 "Using transformers model in CPU mode is not supported. Please change your config variable USE_LLAMA_CPP to True if you want to do CPU inference."
+             )
+
+         # llama-cpp imports (needed here as well, since the GPU-branch imports may not have run)
+         from llama_cpp import Llama
+         from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+
+         model_path = get_model_path(
+             repo_id=repo_id, model_filename=model_filename, model_dir=model_dir
+         )
+
+         # gpu_config.update_gpu(gpu_layers)
+         cpu_config.update_gpu(gpu_layers)
+
+         # Update context length according to slider
+         # gpu_config.update_context(max_context_length)
+         cpu_config.update_context(max_context_length)
+
+         if speculative_decoding:
+             model = Llama(
+                 model_path=model_path,
+                 draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS),
+                 **vars(cpu_config),
+             )
+         else:
+             model = Llama(model_path=model_path, **vars(cpu_config))
+
+         print(
+             "Loading with",
+             cpu_config.n_gpu_layers,
+             "model layers sent to GPU and a maximum context length of",
+             cpu_config.n_ctx,
+         )
+
+     print("Finished loading model:", local_model_type)
+     print("GPU layers assigned to cuda:", gpu_layers)
+
+     # Load assistant model for speculative decoding if enabled
+     if speculative_decoding and USE_LLAMA_CPP == "False" and torch_device == "cuda":
+         print("Loading assistant model for speculative decoding:", ASSISTANT_MODEL)
+         try:
+             from transformers import AutoModelForCausalLM
+
+             # Load the assistant model with the same configuration as the main model
+             assistant_model = AutoModelForCausalLM.from_pretrained(
+                 ASSISTANT_MODEL, dtype=torch_dtype, device_map="auto", token=hf_token
+             )
+
+             # assistant_model.config._name_or_path = model.config._name_or_path
+
+             # Compile the assistant model if compilation is enabled
+             if COMPILE_TRANSFORMERS == "True":
+                 try:
+                     assistant_model = torch.compile(
+                         assistant_model, mode=compile_mode, fullgraph=True
+                     )
+                 except Exception as e:
+                     print(
+                         f"Could not compile assistant model: {e}. Running in eager mode."
+                     )
+
+             print("Successfully loaded assistant model for speculative decoding")
+
+         except Exception as e:
+             print(f"Error loading assistant model: {e}")
+             assistant_model = None
+     else:
+         assistant_model = None
+
+     return model, tokenizer, assistant_model
+
+
+ def get_model():
+     """Get the globally loaded model. Load it if not already loaded."""
+     global _model, _tokenizer, _assistant_model
+     if _model is None:
+         _model, _tokenizer, _assistant_model = load_model(
+             local_model_type=CHOSEN_LOCAL_MODEL_TYPE,
+             gpu_layers=gpu_layers,
+             max_context_length=context_length,
+             gpu_config=gpu_config,
+             cpu_config=cpu_config,
+             torch_device=torch_device,
+             repo_id=LOCAL_REPO_ID,
+             model_filename=LOCAL_MODEL_FILE,
+             model_dir=LOCAL_MODEL_FOLDER,
+             compile_mode=COMPILE_MODE,
+             model_dtype=MODEL_DTYPE,
+             hf_token=HF_TOKEN,
+             model=_model,
+             tokenizer=_tokenizer,
+             assistant_model=_assistant_model,
+         )
+     return _model
+
+
+ def get_tokenizer():
+     """Get the globally loaded tokenizer. Load it if not already loaded."""
+     global _model, _tokenizer, _assistant_model
+     if _tokenizer is None:
+         _model, _tokenizer, _assistant_model = load_model(
+             local_model_type=CHOSEN_LOCAL_MODEL_TYPE,
+             gpu_layers=gpu_layers,
+             max_context_length=context_length,
+             gpu_config=gpu_config,
+             cpu_config=cpu_config,
+             torch_device=torch_device,
+             repo_id=LOCAL_REPO_ID,
+             model_filename=LOCAL_MODEL_FILE,
+             model_dir=LOCAL_MODEL_FOLDER,
+             compile_mode=COMPILE_MODE,
+             model_dtype=MODEL_DTYPE,
+             hf_token=HF_TOKEN,
+             model=_model,
+             tokenizer=_tokenizer,
+             assistant_model=_assistant_model,
+         )
+     return _tokenizer
+
+
+ def get_assistant_model():
+     """Get the globally loaded assistant model. Load it if not already loaded."""
+     global _model, _tokenizer, _assistant_model
+     if _assistant_model is None:
+         _model, _tokenizer, _assistant_model = load_model(
+             local_model_type=CHOSEN_LOCAL_MODEL_TYPE,
+             gpu_layers=gpu_layers,
+             max_context_length=context_length,
+             gpu_config=gpu_config,
+             cpu_config=cpu_config,
+             torch_device=torch_device,
+             repo_id=LOCAL_REPO_ID,
+             model_filename=LOCAL_MODEL_FILE,
+             model_dir=LOCAL_MODEL_FOLDER,
+             compile_mode=COMPILE_MODE,
+             model_dtype=MODEL_DTYPE,
+             hf_token=HF_TOKEN,
+             model=_model,
+             tokenizer=_tokenizer,
+             assistant_model=_assistant_model,
+         )
+     return _assistant_model
+
+
+ def set_model(model, tokenizer, assistant_model=None):
+     """Set the global model, tokenizer, and assistant model."""
+     global _model, _tokenizer, _assistant_model
+     _model = model
+     _tokenizer = tokenizer
+     _assistant_model = assistant_model
+
+
+ # Initialize model at startup if configured
+ if LOAD_LOCAL_MODEL_AT_START == "True" and RUN_LOCAL_MODEL == "1":
+     get_model()  # This will trigger loading
+
+
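A minimal sketch of the lazy-singleton pattern above (illustrative):

    model = get_model()          # loads on first call, cached afterwards
    tokenizer = get_tokenizer()  # reuses the same load_model() result
    # set_model(model, tokenizer) can inject pre-loaded objects, e.g. in tests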
+ def call_llama_cpp_model(
+     formatted_string: str, gen_config: LlamaCPPGenerationConfig, model=None
+ ):
+     """
+     Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
+
+     Args:
+         formatted_string (str): The formatted input text for the model.
+         gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
+         model: Optional model instance. If None, will use the globally loaded model.
+     """
+     if model is None:
+         model = get_model()
+
+     if model is None:
+         raise ValueError(
+             "No model available. Either pass a model parameter or ensure LOAD_LOCAL_MODEL_AT_START is True."
+         )
+
+     # Extracting parameters from the gen_config object
+     temperature = gen_config.temperature
+     top_k = gen_config.top_k
+     top_p = gen_config.top_p
+     repeat_penalty = gen_config.repeat_penalty
+     seed = gen_config.seed
+     max_tokens = gen_config.max_tokens
+     stream = gen_config.stream
+
+     # Now you can call your model directly, passing the parameters:
+     output = model(
+         formatted_string,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p,
+         repeat_penalty=repeat_penalty,
+         seed=seed,
+         max_tokens=max_tokens,
+         stream=stream,
+         # stop=["<|eot_id|>", "\n\n"]
+     )
+
+     return output
+
+
+ def call_llama_cpp_chatmodel(
+     formatted_string: str,
+     system_prompt: str,
+     gen_config: LlamaCPPGenerationConfig,
+     model=None,
+ ):
+     """
+     Calls your Llama.cpp chat model with a formatted user message and system prompt,
+     using generation parameters from the LlamaCPPGenerationConfig object.
+
+     Args:
+         formatted_string (str): The formatted input text for the user's message.
+         system_prompt (str): The system-level instructions for the model.
+         gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
+         model: Optional model instance. If None, will use the globally loaded model.
+     """
+     if model is None:
+         model = get_model()
+
+     if model is None:
+         raise ValueError(
+             "No model available. Either pass a model parameter or ensure LOAD_LOCAL_MODEL_AT_START is True."
+         )
+
+     # Extracting parameters from the gen_config object
+     temperature = gen_config.temperature
+     top_k = gen_config.top_k
+     top_p = gen_config.top_p
+     repeat_penalty = gen_config.repeat_penalty
+     seed = gen_config.seed
+     max_tokens = gen_config.max_tokens
+     stream = gen_config.stream
+     reset = gen_config.reset
+
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": formatted_string},
+     ]
+
+     input_tokens = len(
+         model.tokenize(
+             (system_prompt + "\n" + formatted_string).encode("utf-8"), special=True
+         )
+     )
+
+     if stream:
+         final_tokens = list()
+         output_tokens = 0
+         for chunk in model.create_chat_completion(
+             messages=messages,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             repeat_penalty=repeat_penalty,
+             seed=seed,
+             max_tokens=max_tokens,
+             stream=True,
+             stop=stop_strings,
+         ):
+             delta = chunk["choices"][0].get("delta", {})
+             token = delta.get("content") or chunk["choices"][0].get("text") or ""
+             if token:
+                 print(token, end="", flush=True)
+                 final_tokens.append(token)
+                 output_tokens += 1
+         print()  # newline after stream finishes
+
+         text = "".join(final_tokens)
+
+         if reset:
+             model.reset()
+
+         return {
+             "choices": [
+                 {
+                     "index": 0,
+                     "finish_reason": "stop",
+                     "message": {"role": "assistant", "content": text},
+                 }
+             ],
+             # Provide a usage object so downstream code can read it
+             "usage": {
+                 "prompt_tokens": input_tokens,  # counted via model.tokenize above
+                 "completion_tokens": output_tokens,  # counted per streamed chunk
+                 "total_tokens": input_tokens + output_tokens,
+             },
+         }
+
+     else:
+         response = model.create_chat_completion(
+             messages=messages,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             repeat_penalty=repeat_penalty,
+             seed=seed,
+             max_tokens=max_tokens,
+             stream=False,
+             stop=stop_strings,
+         )
+
+         if reset:
+             model.reset()
+
+         return response
+
+
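A minimal usage sketch (prompt text illustrative):

    gen_config = LlamaCPPGenerationConfig(stream=False)
    response = call_llama_cpp_chatmodel(
        "Summarise the responses in the table.",
        "You are a topic-modelling assistant.",
        gen_config,
    )
    text = response["choices"][0]["message"]["content"]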
+ def call_inference_server_api(
+     formatted_string: str,
+     system_prompt: str,
+     gen_config: LlamaCPPGenerationConfig,
+     api_url: str = "http://localhost:8080",
+     model_name: str = None,
+     use_llama_swap: bool = USE_LLAMA_SWAP,
+ ):
+     """
+     Calls an inference-server API endpoint with a formatted user message and system prompt,
+     using generation parameters from the LlamaCPPGenerationConfig object.
+
+     This function provides the same interface as call_llama_cpp_chatmodel but calls
+     a remote inference-server instance instead of a local model.
+
+     Args:
+         formatted_string (str): The formatted input text for the user's message.
+         system_prompt (str): The system-level instructions for the model.
+         gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
+         api_url (str): The base URL of the inference-server API (default: "http://localhost:8080").
+         model_name (str): Optional model name to use. If None, uses the default model.
+         use_llama_swap (bool): Whether to use llama-swap for the model.
+     Returns:
+         dict: Response in the same format as call_llama_cpp_chatmodel
+
+     Example:
+         # Create generation config
+         gen_config = LlamaCPPGenerationConfig(temperature=0.7, max_tokens=100)
+
+         # Call the API
+         response = call_inference_server_api(
+             formatted_string="Hello, how are you?",
+             system_prompt="You are a helpful assistant.",
+             gen_config=gen_config,
+             api_url="http://localhost:8080"
+         )
+
+         # Extract the response text
+         response_text = response['choices'][0]['message']['content']
+
+     Integration Example:
+         # To use inference-server instead of local model:
+         # 1. Set model_source to "inference-server"
+         # 2. Provide api_url parameter
+         # 3. Call your existing functions as normal
+
+         responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(
+             batch_prompts=["Your prompt here"],
+             system_prompt="Your system prompt",
+             conversation_history=[],
+             whole_conversation=[],
+             whole_conversation_metadata=[],
+             client=None,  # Not used for inference-server
+             client_config=None,  # Not used for inference-server
+             model_choice="your-model-name",  # Model name on the server
+             temperature=0.7,
+             reported_batch_no=1,
+             local_model=None,  # Not used for inference-server
+             tokenizer=None,  # Not used for inference-server
+             bedrock_runtime=None,  # Not used for inference-server
+             model_source="inference-server",
+             MAX_OUTPUT_VALIDATION_ATTEMPTS=3,
+             api_url="http://localhost:8080"
+         )
+     """
+     # Extract parameters from the gen_config object
+     temperature = gen_config.temperature
+     top_k = gen_config.top_k
+     top_p = gen_config.top_p
+     repeat_penalty = gen_config.repeat_penalty
+     seed = gen_config.seed
+     max_tokens = gen_config.max_tokens
+     stream = gen_config.stream
+
+     # Prepare the request payload
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": formatted_string},
+     ]
+
+     payload = {
+         "messages": messages,
+         "temperature": temperature,
+         "top_k": top_k,
+         "top_p": top_p,
+         "repeat_penalty": repeat_penalty,
+         "seed": seed,
+         "max_tokens": max_tokens,
+         "stream": stream,
+         "stop": LLM_STOP_STRINGS if LLM_STOP_STRINGS else [],
+     }
+     # Add model name if specified and use llama-swap
+     if model_name and use_llama_swap:
+         payload["model"] = model_name
+
+     # Streaming and non-streaming requests share the same chat completions endpoint
+     endpoint = f"{api_url}/v1/chat/completions"
+
+     try:
+         if stream:
+             # Handle streaming response
+             response = requests.post(
+                 endpoint,
+                 json=payload,
+                 headers={"Content-Type": "application/json"},
+                 stream=True,
+                 timeout=timeout_wait,
+             )
+             response.raise_for_status()
+
+             final_tokens = []
+             output_tokens = 0
+
+             for line in response.iter_lines():
+                 if line:
+                     line = line.decode("utf-8")
+                     if line.startswith("data: "):
+                         data = line[6:]  # Remove 'data: ' prefix
+                         if data.strip() == "[DONE]":
+                             break
+                         try:
+                             chunk = json.loads(data)
+                             if "choices" in chunk and len(chunk["choices"]) > 0:
+                                 delta = chunk["choices"][0].get("delta", {})
+                                 token = delta.get("content", "")
+                                 if token:
+                                     print(token, end="", flush=True)
+                                     final_tokens.append(token)
+                                     output_tokens += 1
+                         except json.JSONDecodeError:
+                             continue
+
+             print()  # newline after stream finishes
+
+             text = "".join(final_tokens)
+
+             # Estimate input tokens (rough approximation)
+             input_tokens = len((system_prompt + "\n" + formatted_string).split())
+
+             return {
+                 "choices": [
+                     {
+                         "index": 0,
+                         "finish_reason": "stop",
+                         "message": {"role": "assistant", "content": text},
+                     }
+                 ],
+                 "usage": {
+                     "prompt_tokens": input_tokens,
+                     "completion_tokens": output_tokens,
+                     "total_tokens": input_tokens + output_tokens,
+                 },
+             }
+         else:
+             # Handle non-streaming response
+             response = requests.post(
+                 endpoint,
+                 json=payload,
+                 headers={"Content-Type": "application/json"},
+                 timeout=timeout_wait,
+             )
+             response.raise_for_status()
+
+             result = response.json()
+
+             # Ensure the response has the expected format
+             if "choices" not in result:
+                 raise ValueError("Invalid response format from inference-server")
+
+             return result
+
+     except requests.exceptions.RequestException as e:
+         raise ConnectionError(
+             f"Failed to connect to inference-server at {api_url}: {str(e)}"
+         )
+     except json.JSONDecodeError as e:
+         raise ValueError(f"Invalid JSON response from inference-server: {str(e)}")
+     except Exception as e:
+         raise RuntimeError(f"Error calling inference-server API: {str(e)}")
+
+
+ ###
+ # LLM FUNCTIONS
+ ###
+
+
+ def construct_gemini_generative_model(
+     in_api_key: str,
+     temperature: float,
+     model_choice: str,
+     system_prompt: str,
+     max_tokens: int,
+     random_seed=seed,
+ ) -> Tuple[object, dict]:
+     """
+     Constructs a GenerativeModel for Gemini API calls.
+     ...
+     """
+     # Construct a GenerativeModel
+     try:
+         if in_api_key:
+             # print("Getting API key from textbox")
+             api_key = in_api_key
+             client = ai.Client(api_key=api_key)
+         elif "GOOGLE_API_KEY" in os.environ:
+             # print("Searching for API key in environmental variables")
+             api_key = os.environ["GOOGLE_API_KEY"]
+             client = ai.Client(api_key=api_key)
+         else:
+             print("No Gemini API key found")
+             raise Warning("No Gemini API key found.")
+     except Exception as e:
+         print("Error constructing Gemini generative model:", e)
+         raise Warning("Error constructing Gemini generative model:", e)
+
+     config = types.GenerateContentConfig(
+         temperature=temperature, max_output_tokens=max_tokens, seed=random_seed
+     )
+
+     return client, config
+
+
+ def construct_azure_client(in_api_key: str, endpoint: str) -> Tuple[object, dict]:
+     """
+     Constructs an OpenAI client for Azure/OpenAI AI Inference.
+     """
+     try:
+         key = None
+         if in_api_key:
+             key = in_api_key
+         elif os.environ.get("AZURE_OPENAI_API_KEY"):
+             key = os.environ["AZURE_OPENAI_API_KEY"]
+         if not key:
+             raise Warning("No Azure/OpenAI API key found.")
+
+         if not endpoint:
+             endpoint = os.environ.get("AZURE_OPENAI_INFERENCE_ENDPOINT", "")
+         if not endpoint:
+             # Assume using OpenAI API
+             client = OpenAI(
+                 api_key=key,
+             )
+         else:
+             # Use the provided endpoint
+             client = OpenAI(
+                 api_key=key,
+                 base_url=f"{endpoint}",
+             )
+
+         return client, dict()
+     except Exception as e:
+         print("Error constructing Azure/OpenAI client:", e)
+         raise
+
+
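A hedged usage sketch: with no endpoint the client targets the public OpenAI API; with an endpoint it targets your Azure deployment (key and URL below are hypothetical).

    client, _ = construct_azure_client(in_api_key="sk-...", endpoint="")
    client, _ = construct_azure_client(
        in_api_key="azure-key",
        endpoint="https://my-resource.openai.azure.com/openai/v1",
    )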
+ def call_aws_bedrock(
+     prompt: str,
+     system_prompt: str,
+     temperature: float,
+     max_tokens: int,
+     model_choice: str,
+     bedrock_runtime: boto3.Session.client,
+     assistant_prefill: str = "",
+ ) -> ResponseObject:
+     """
+     This function sends a request to AWS Claude with the following parameters:
+     - prompt: The user's input prompt to be processed by the model.
+     - system_prompt: A system-defined prompt that provides context or instructions for the model.
+     - temperature: A value that controls the randomness of the model's output, with higher values resulting in more diverse responses.
+     - max_tokens: The maximum number of tokens (words or characters) in the model's response.
+     - model_choice: The specific model to use for processing the request.
+     - bedrock_runtime: The client object for boto3 Bedrock runtime
+     - assistant_prefill: A string indicating the text that the response should start with.
+
+     The function constructs the request configuration, invokes the model, extracts the response text, and returns a ResponseObject containing the text and metadata.
+     """
+
+     inference_config = {
+         "maxTokens": max_tokens,
+         "topP": 0.999,
+         "temperature": temperature,
+     }
+
+     # Using an assistant prefill only works for Anthropic models.
+     if assistant_prefill and "anthropic" in model_choice:
+         assistant_prefill_added = True
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"text": prompt},
+                 ],
+             },
+             {
+                 "role": "assistant",
+                 # Pre-filling with '|'
+                 "content": [{"text": assistant_prefill}],
+             },
+         ]
+     else:
+         assistant_prefill_added = False
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"text": prompt},
+                 ],
+             }
+         ]
+
+     system_prompt_list = [{"text": system_prompt}]
+
+     # The converse API call.
+     api_response = bedrock_runtime.converse(
+         modelId=model_choice,
+         messages=messages,
+         system=system_prompt_list,
+         inferenceConfig=inference_config,
+     )
+
+     output_message = api_response["output"]["message"]
+
+     if "reasoningContent" in output_message["content"][0]:
+         # Extract the reasoning text (currently unused beyond this point)
+         reasoning_text = output_message["content"][0]["reasoningContent"][
+             "reasoningText"
+         ]["text"]
+
+         # Extract the output text
+         if assistant_prefill_added:
+             text = assistant_prefill + output_message["content"][1]["text"]
+         else:
+             text = output_message["content"][1]["text"]
+     else:
+         if assistant_prefill_added:
+             text = assistant_prefill + output_message["content"][0]["text"]
+         else:
+             text = output_message["content"][0]["text"]
+
+     # The usage statistics are neatly provided in the 'usage' key.
+     usage = api_response["usage"]
+
+     # The full API response metadata is in api_response["ResponseMetadata"] if you still need it.
+
+     # Create ResponseObject with the cleanly extracted data.
+     response = ResponseObject(text=text, usage_metadata=usage)
+
+     return response
+
+
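A hedged usage sketch for the Bedrock path (region and model ID are illustrative):

    import boto3

    bedrock_runtime = boto3.client("bedrock-runtime", region_name="eu-west-2")
    response = call_aws_bedrock(
        prompt="Extract topics from these responses...",
        system_prompt="You are a topic-modelling assistant.",
        temperature=0.1,
        max_tokens=4096,
        model_choice="anthropic.claude-3-haiku-20240307-v1:0",
        bedrock_runtime=bedrock_runtime,
    )
    print(response.text, response.usage_metadata)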
+ def call_transformers_model(
+     prompt: str,
+     system_prompt: str,
+     gen_config: LlamaCPPGenerationConfig,
+     model=None,
+     tokenizer=None,
+     assistant_model=None,
+     speculative_decoding=speculative_decoding,
+ ):
+     """
+     This function sends a request to a transformers model (through Unsloth) with the given prompt, system prompt, and generation configuration.
+     """
+     from transformers import TextStreamer
+
+     if model is None:
+         model = get_model()
+     if tokenizer is None:
+         tokenizer = get_tokenizer()
+     if assistant_model is None and speculative_decoding:
+         assistant_model = get_assistant_model()
+
+     if model is None or tokenizer is None:
+         raise ValueError(
+             "No model or tokenizer available. Either pass them as parameters or ensure LOAD_LOCAL_MODEL_AT_START is True."
+         )
+
+     # 1. Define the conversation as a list of dictionaries
+     # Note: The multimodal format [{"type": "text", "text": text}] is only needed for actual multimodal models
+     # with images/videos. For text-only content, even multimodal models expect plain strings.
+
+     # Always use string format for text-only content, regardless of MULTIMODAL_PROMPT_FORMAT setting
+     # MULTIMODAL_PROMPT_FORMAT should only be used when you actually have multimodal inputs (images, etc.)
+     if MULTIMODAL_PROMPT_FORMAT == "True":
+         conversation = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": str(system_prompt)}],
+             },
+             {"role": "user", "content": [{"type": "text", "text": str(prompt)}]},
+         ]
+     else:
+         conversation = [
+             {"role": "system", "content": str(system_prompt)},
+             {"role": "user", "content": str(prompt)},
+         ]
+
+     # 2. Apply the chat template
+     try:
+         # Try applying chat template
+         input_ids = tokenizer.apply_chat_template(
+             conversation,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_tensors="pt",
+         ).to("cuda")
+     except (TypeError, KeyError, IndexError) as e:
+         # If chat template fails, try manual formatting
+         print(f"Chat template failed ({e}), using manual tokenization")
+         # Combine system and user prompts manually
+         full_prompt = f"{system_prompt}\n\n{prompt}"
+         # Tokenize manually with special tokens
+         encoded = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=True)
+         if encoded is None:
+             raise ValueError(
+                 "Tokenizer returned None - tokenizer may not be properly initialized"
+             )
+         if not hasattr(encoded, "input_ids") or encoded.input_ids is None:
+             raise ValueError("Tokenizer output does not contain input_ids")
+         input_ids = encoded.input_ids.to("cuda")
+     except Exception as e:
+         print("Error applying chat template:", e)
+         import traceback
+
+         traceback.print_exc()
+         raise
+
+     # Map LlamaCPP parameters to transformers parameters
+     generation_kwargs = {
+         "max_new_tokens": gen_config.max_tokens,
+         "temperature": gen_config.temperature,
+         "top_p": gen_config.top_p,
+         "top_k": gen_config.top_k,
+         "do_sample": True,
+         # 'pad_token_id': tokenizer.eos_token_id
+     }
+
+     if gen_config.stream:
+         streamer = TextStreamer(tokenizer, skip_prompt=True)
+     else:
+         streamer = None
+
+     # Map repeat_penalty to the transformers parameter name, repetition_penalty
+     if hasattr(gen_config, "repeat_penalty"):
+         generation_kwargs["repetition_penalty"] = gen_config.repeat_penalty
+
+     # --- Timed Inference Test ---
+     print("\nStarting model inference...")
+     start_time = time.time()
+
+     # Use speculative decoding if assistant model is available
+     try:
+         if speculative_decoding and assistant_model is not None:
+             # print("Using speculative decoding with assistant model")
+             outputs = model.generate(
+                 input_ids,
+                 assistant_model=assistant_model,
+                 **generation_kwargs,
+                 streamer=streamer,
+             )
+         else:
+             # print("Generating without speculative decoding")
+             outputs = model.generate(input_ids, **generation_kwargs, streamer=streamer)
+     except Exception as e:
+         error_msg = str(e)
+         # Check if this is a CUDA compilation error
+         if (
+             "sm_120" in error_msg
+             or "LLVM ERROR" in error_msg
+             or "Cannot select" in error_msg
+         ):
+             print("\n" + "=" * 80)
+             print("CUDA COMPILATION ERROR DETECTED")
+             print("=" * 80)
+             print(
+                 "\nThe error is caused by torch.compile() trying to compile CUDA kernels"
+             )
+             print(
+                 "with incompatible settings. This is a known issue with certain CUDA/PyTorch"
+             )
+             print("combinations.\n")
+             print(
+                 "SOLUTION: Disable model compilation by setting COMPILE_TRANSFORMERS=False"
+             )
+             print("in your config file (config/app_config.env).")
+             print(
+                 "\nThe model will still work without compilation, just slightly slower."
+             )
+             print("=" * 80 + "\n")
+             raise RuntimeError(
+                 "CUDA compilation error detected. Please set COMPILE_TRANSFORMERS=False "
+                 "in your config file to disable model compilation and avoid this error."
+             ) from e
+         else:
+             # Re-raise other errors as-is
+             raise
+
+     end_time = time.time()
+
+     # --- Decode and Display Results ---
+     new_tokens = outputs[0][input_ids.shape[-1]:]
+     assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+     num_input_tokens = input_ids.shape[-1]  # This gets the sequence length (number of tokens)
+     num_generated_tokens = len(new_tokens)
+     duration = end_time - start_time
+     tokens_per_second = num_generated_tokens / duration
+
+     print("\n--- Performance ---")
+     print(f"Time taken: {duration:.2f} seconds")
+     print(f"Generated tokens: {num_generated_tokens}")
+     print(f"Tokens per second: {tokens_per_second:.2f}")
+
+     return assistant_reply, num_input_tokens, num_generated_tokens
+
+
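A hedged usage sketch (prompt text illustrative; assumes a CUDA-loaded transformers model):

    gen_config = LlamaCPPGenerationConfig(stream=False)
    reply, n_in, n_out = call_transformers_model(
        "List the main topics in these case notes.",
        "You are a topic-modelling assistant.",
        gen_config,
    )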
1337
+ # Function to send a request and update history
1338
+ def send_request(
1339
+ prompt: str,
1340
+ conversation_history: List[dict],
1341
+ client: ai.Client | OpenAI,
1342
+ config: types.GenerateContentConfig,
1343
+ model_choice: str,
1344
+ system_prompt: str,
1345
+ temperature: float,
1346
+ bedrock_runtime: boto3.Session.client,
1347
+ model_source: str,
1348
+ local_model=list(),
1349
+ tokenizer=None,
1350
+ assistant_model=None,
1351
+ assistant_prefill="",
1352
+ progress=Progress(track_tqdm=True),
1353
+ api_url: str = None,
1354
+ ) -> Tuple[str, List[dict]]:
1355
+ """Sends a request to a language model and manages the conversation history.
1356
+
1357
+ This function constructs the full prompt by appending the new user prompt to the conversation history,
1358
+ generates a response from the model, and updates the conversation history with the new prompt and response.
1359
+ It handles different model sources (Gemini, AWS, Local, inference-server) and includes retry logic for API calls.
1360
+
1361
+ Args:
1362
+ prompt (str): The user's input prompt to be sent to the model.
1363
+ conversation_history (List[dict]): A list of dictionaries representing the ongoing conversation.
1364
+ Each dictionary should have 'role' and 'parts' keys.
1365
+ client (ai.Client): The API client object for the chosen model (e.g., Gemini `ai.Client`, or Azure/OpenAI `OpenAI`).
1366
+ config (types.GenerateContentConfig): Configuration settings for content generation (e.g., Gemini `types.GenerateContentConfig`).
1367
+ model_choice (str): The specific model identifier to use (e.g., "gemini-pro", "claude-v2").
1368
+ system_prompt (str): An optional system-level instruction or context for the model.
1369
+ temperature (float): Controls the randomness of the model's output, with higher values leading to more diverse responses.
1370
+ bedrock_runtime (boto3.Session.client): The boto3 Bedrock runtime client object for AWS models.
1371
+ model_source (str): Indicates the source/provider of the model (e.g., "Gemini", "AWS", "Local", "inference-server").
1372
+ local_model (list, optional): A list containing the local model and its tokenizer (if `model_source` is "Local"). Defaults to [].
1373
+ tokenizer (object, optional): The tokenizer object for local models. Defaults to None.
1374
+ assistant_model (object, optional): An optional assistant model used for speculative decoding with local models. Defaults to None.
1375
+ assistant_prefill (str, optional): A string to pre-fill the assistant's response, useful for certain models like Claude. Defaults to "".
1376
+ progress (Progress, optional): A progress object for tracking the operation, typically from `tqdm`. Defaults to Progress(track_tqdm=True).
1377
+ api_url (str, optional): The API URL for inference-server calls. Required when model_source is 'inference-server'.
1378
+
1379
+ Returns:
1380
+ Tuple[str, List[dict]]: A tuple containing the model's response text and the updated conversation history.
1381
+ """
1382
+ # Constructing the full prompt from the conversation history
1383
+ full_prompt = "Conversation history:\n"
1384
+ num_transformer_input_tokens = 0
1385
+ num_transformer_generated_tokens = 0
1386
+ response_text = ""
1387
+
1388
+ for entry in conversation_history:
1389
+ role = entry[
1390
+ "role"
1391
+ ].capitalize() # Assuming the history is stored with 'role' and 'parts'
1392
+ message = " ".join(entry["parts"]) # Combining all parts of the message
1393
+ full_prompt += f"{role}: {message}\n"
1394
+
1395
+ # Adding the new user prompt
1396
+ full_prompt += f"\nUser: {prompt}"
1397
+
1398
+ # Clear any existing progress bars
1399
+ tqdm._instances.clear()
1400
+
1401
+ progress_bar = range(0, number_of_api_retry_attempts)
1402
+
1403
+ # Generate the model's response
1404
+ if "Gemini" in model_source:
1405
+
1406
+ for i in progress_bar:
1407
+ try:
1408
+ print("Calling Gemini model, attempt", i + 1)
1409
+
1410
+ response = client.models.generate_content(
1411
+ model=model_choice, contents=full_prompt, config=config
1412
+ )
1413
+
1414
+ # print("Successful call to Gemini model.")
1415
+ break
1416
+ except Exception as e:
1417
+ # If fails, try again after X seconds in case there is a throttle limit
1418
+ print(
1419
+ "Call to Gemini model failed:",
1420
+ e,
1421
+ " Waiting for ",
1422
+ str(timeout_wait),
1423
+ "seconds and trying again.",
1424
+ )
1425
+
1426
+ time.sleep(timeout_wait)
1427
+
1428
+ if i == number_of_api_retry_attempts:
1429
+ return (
1430
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1431
+ conversation_history,
1432
+ response_text,
1433
+ num_transformer_input_tokens,
1434
+ num_transformer_generated_tokens,
1435
+ )
1436
+
1437
+ elif "AWS" in model_source:
1438
+ for i in progress_bar:
1439
+ try:
1440
+ print("Calling AWS Bedrock model, attempt", i + 1)
1441
+ response = call_aws_bedrock(
1442
+ prompt,
1443
+ system_prompt,
1444
+ temperature,
1445
+ max_tokens,
1446
+ model_choice,
1447
+ bedrock_runtime=bedrock_runtime,
1448
+ assistant_prefill=assistant_prefill,
1449
+ )
1450
+
1451
+ # print("Successful call to Claude model.")
1452
+ break
1453
+ except Exception as e:
1454
+ # If fails, try again after X seconds in case there is a throttle limit
1455
+ print(
1456
+ "Call to Bedrock model failed:",
1457
+ e,
1458
+ " Waiting for ",
1459
+ str(timeout_wait),
1460
+ "seconds and trying again.",
1461
+ )
1462
+ time.sleep(timeout_wait)
1463
+
1464
+ if i == number_of_api_retry_attempts:
1465
+ return (
1466
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1467
+ conversation_history,
1468
+ response_text,
1469
+ num_transformer_input_tokens,
1470
+ num_transformer_generated_tokens,
1471
+ )
1472
+ elif "Azure/OpenAI" in model_source:
1473
+ for i in progress_bar:
1474
+ try:
1475
+ print("Calling Azure/OpenAI inference model, attempt", i + 1)
1476
+
1477
+ messages = [
1478
+ {
1479
+ "role": "system",
1480
+ "content": system_prompt,
1481
+ },
1482
+ {
1483
+ "role": "user",
1484
+ "content": prompt,
1485
+ },
1486
+ ]
1487
+
1488
+ response_raw = client.chat.completions.create(
1489
+ messages=messages,
1490
+ model=model_choice,
1491
+ temperature=temperature,
1492
+ max_completion_tokens=max_tokens,
1493
+ )
1494
+
1495
+ response_text = response_raw.choices[0].message.content
1496
+ usage = getattr(response_raw, "usage", None)
1497
+ input_tokens = 0
1498
+ output_tokens = 0
1499
+ if usage is not None:
1500
+ input_tokens = getattr(
1501
+ usage, "input_tokens", getattr(usage, "prompt_tokens", 0)
1502
+ )
1503
+ output_tokens = getattr(
1504
+ usage, "output_tokens", getattr(usage, "completion_tokens", 0)
1505
+ )
1506
+ response = ResponseObject(
1507
+ text=response_text,
1508
+ usage_metadata={
1509
+ "inputTokens": input_tokens,
1510
+ "outputTokens": output_tokens,
1511
+ },
1512
+ )
1513
+ break
1514
+ except Exception as e:
1515
+ print(
1516
+ "Call to Azure/OpenAI model failed:",
1517
+ e,
1518
+ " Waiting for ",
1519
+ str(timeout_wait),
1520
+ "seconds and trying again.",
1521
+ )
1522
+ time.sleep(timeout_wait)
1523
+ if i == number_of_api_retry_attempts:
1524
+ return (
1525
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1526
+ conversation_history,
1527
+ response_text,
1528
+ num_transformer_input_tokens,
1529
+ num_transformer_generated_tokens,
1530
+ )
1531
+ elif "Local" in model_source:
1532
+ # This is the local model
1533
+ for i in progress_bar:
1534
+ try:
1535
+ print("Calling local model, attempt", i + 1)
1536
+
1537
+ gen_config = LlamaCPPGenerationConfig()
1538
+ gen_config.update_temp(temperature)
1539
+
1540
+ if USE_LLAMA_CPP == "True":
1541
+ response = call_llama_cpp_chatmodel(
1542
+ prompt, system_prompt, gen_config, model=local_model
1543
+ )
1544
+
1545
+ else:
1546
+ (
1547
+ response,
1548
+ num_transformer_input_tokens,
1549
+ num_transformer_generated_tokens,
1550
+ ) = call_transformers_model(
1551
+ prompt,
1552
+ system_prompt,
1553
+ gen_config,
1554
+ model=local_model,
1555
+ tokenizer=tokenizer,
1556
+ assistant_model=assistant_model,
1557
+ )
1558
+ response_text = response
1559
+
1560
+ break
1561
+ except Exception as e:
1562
+ # If fails, try again after X seconds in case there is a throttle limit
1563
+ print(
1564
+ "Call to local model failed:",
1565
+ e,
1566
+ " Waiting for ",
1567
+ str(timeout_wait),
1568
+ "seconds and trying again.",
1569
+ )
1570
+
1571
+ time.sleep(timeout_wait)
1572
+
1573
+ if i == number_of_api_retry_attempts:
1574
+ return (
1575
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1576
+ conversation_history,
1577
+ response_text,
1578
+ num_transformer_input_tokens,
1579
+ num_transformer_generated_tokens,
1580
+ )
1581
+ elif "inference-server" in model_source:
1582
+ # This is the inference-server API
1583
+ for i in progress_bar:
1584
+ try:
1585
+ print("Calling inference-server API, attempt", i + 1)
1586
+
1587
+ if api_url is None:
1588
+ raise ValueError(
1589
+ "api_url is required when model_source is 'inference-server'"
1590
+ )
1591
+
1592
+ gen_config = LlamaCPPGenerationConfig()
1593
+ gen_config.update_temp(temperature)
1594
+
1595
+ response = call_inference_server_api(
1596
+ prompt,
1597
+ system_prompt,
1598
+ gen_config,
1599
+ api_url=api_url,
1600
+ model_name=model_choice,
1601
+ )
1602
+
1603
+ break
1604
+ except Exception as e:
1605
+ # If fails, try again after X seconds in case there is a throttle limit
1606
+ print(
1607
+ "Call to inference-server API failed:",
1608
+ e,
1609
+ " Waiting for ",
1610
+ str(timeout_wait),
1611
+ "seconds and trying again.",
1612
+ )
1613
+
1614
+ time.sleep(timeout_wait)
1615
+
1616
+ if i == number_of_api_retry_attempts:
1617
+ return (
1618
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1619
+ conversation_history,
1620
+ response_text,
1621
+ num_transformer_input_tokens,
1622
+ num_transformer_generated_tokens,
1623
+ )
1624
+ else:
1625
+ print("Model source not recognised")
1626
+ return (
1627
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
1628
+ conversation_history,
1629
+ response_text,
1630
+ num_transformer_input_tokens,
1631
+ num_transformer_generated_tokens,
1632
+ )
1633
+
1634
+ # Update the conversation history with the new prompt and response
1635
+ conversation_history.append({"role": "user", "parts": [prompt]})
1636
+
1637
+ # Check whether this is a llama.cpp model response or an inference-server response
1638
+ if isinstance(response, ResponseObject):
1639
+ response_text = response.text
1640
+ elif "choices" in response: # LLama.cpp model response or inference-server response
1641
+ if "gpt-oss" in model_choice:
1642
+ response_text = response["choices"][0]["message"]["content"].split(
1643
+ "<|start|>assistant<|channel|>final<|message|>"
1644
+ )[1]
1645
+ else:
1646
+ response_text = response["choices"][0]["message"]["content"]
1647
+ elif model_source == "Gemini":
1648
+ response_text = response.text
1649
+ else: # Assume transformers model response
1650
+ if "gpt-oss" in model_choice:
1651
+ response_text = response.split(
1652
+ "<|start|>assistant<|channel|>final<|message|>"
1653
+ )[1]
1654
+ else:
1655
+ response_text = response
1656
+
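# Note (illustrative, not part of the commit): gpt-oss models emit OpenAI
# "harmony" channel markers, and the splits above keep only the text after
# the final-channel marker. On a raw string this looks like:
#   raw = "<|start|>assistant<|channel|>final<|message|>The answer."
#   raw.split("<|start|>assistant<|channel|>final<|message|>")[1]  # -> "The answer."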
1657
+ # Replace multiple spaces with single space
1658
+ response_text = re.sub(r" {2,}", " ", response_text)
1659
+ response_text = response_text.strip()
1660
+
1661
+ conversation_history.append({"role": "assistant", "parts": [response_text]})
1662
+
1663
+ return (
1664
+ response,
1665
+ conversation_history,
1666
+ response_text,
1667
+ num_transformer_input_tokens,
1668
+ num_transformer_generated_tokens,
1669
+ )
1670
+
1671
+
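Every model source above follows the same retry shape: a bounded attempt loop that sleeps `timeout_wait` seconds between failures and falls back to a FAILED `ResponseObject` after the last attempt. A minimal, hedged sketch of that shared pattern (the helper name is illustrative, not from the repo):

import time

def _retry_call(call_fn, attempts: int, wait_seconds: int):
    # Generic form of the per-source retry loops above (sketch only).
    for i in range(attempts):
        try:
            return call_fn()
        except Exception as e:
            print(f"Attempt {i + 1} failed: {e}. Waiting {wait_seconds} seconds.")
            time.sleep(wait_seconds)
    # The real code returns a ResponseObject with RequestId "FAILED" here.
    raise RuntimeError("All retry attempts failed")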
1672
+ def process_requests(
1673
+ prompts: List[str],
1674
+ system_prompt: str,
1675
+ conversation_history: List[dict],
1676
+ whole_conversation: List[str],
1677
+ whole_conversation_metadata: List[str],
1678
+ client: ai.Client | OpenAI,
1679
+ config: types.GenerateContentConfig,
1680
+ model_choice: str,
1681
+ temperature: float,
1682
+ bedrock_runtime: boto3.Session.client,
1683
+ model_source: str,
1684
+ batch_no: int = 1,
1685
+ local_model=list(),
1686
+ tokenizer=None,
1687
+ assistant_model=None,
1688
+ master: bool = False,
1689
+ assistant_prefill="",
1690
+ api_url: str = None,
1691
+ ) -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
1692
+ """
1693
+ Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
1694
+
1695
+ Args:
1696
+ prompts (List[str]): A list of prompts to be processed.
1697
+ system_prompt (str): The system prompt.
1698
+ conversation_history (List[dict]): The history of the conversation.
1699
+ whole_conversation (List[str]): The complete conversation including prompts and responses.
1700
+ whole_conversation_metadata (List[str]): Metadata about the whole conversation.
1701
+ client (object): The client to use for processing the prompts, from either Gemini or OpenAI client.
1702
+ config (dict): Configuration for the model.
1703
+ model_choice (str): The choice of model to use.
1704
+ temperature (float): The temperature parameter for the model.
1705
+ model_source (str): Source of the model, whether local, AWS, Gemini, or inference-server
1706
+ batch_no (int): Batch number of the large language model request.
1707
+ local_model: Local gguf model (if loaded)
1708
+ master (bool): Is this request for the master table.
1709
+ assistant_prefill (str, optional): Optional text to prefill the assistant response. Currently only supported for AWS model calls.
1710
+ bedrock_runtime: The client object for boto3 Bedrock runtime
1711
+ api_url (str, optional): The API URL for inference-server calls. Required when model_source is 'inference-server'.
1712
+
1713
+ Returns:
1714
+ Tuple[List[ResponseObject], List[dict], List[str], List[str]]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, and the updated whole conversation metadata.
1715
+ """
1716
+ responses = list()
1717
+
1718
+ # Clear any existing progress bars
1719
+ tqdm._instances.clear()
1720
+
1721
+ for prompt in prompts:
1722
+
1723
+ (
1724
+ response,
1725
+ conversation_history,
1726
+ response_text,
1727
+ num_transformer_input_tokens,
1728
+ num_transformer_generated_tokens,
1729
+ ) = send_request(
1730
+ prompt,
1731
+ conversation_history,
1732
+ client=client,
1733
+ config=config,
1734
+ model_choice=model_choice,
1735
+ system_prompt=system_prompt,
1736
+ temperature=temperature,
1737
+ local_model=local_model,
1738
+ tokenizer=tokenizer,
1739
+ assistant_model=assistant_model,
1740
+ assistant_prefill=assistant_prefill,
1741
+ bedrock_runtime=bedrock_runtime,
1742
+ model_source=model_source,
1743
+ api_url=api_url,
1744
+ )
1745
+
1746
+ responses.append(response)
1747
+ whole_conversation.append(system_prompt)
1748
+ whole_conversation.append(prompt)
1749
+ whole_conversation.append(response_text)
1750
+
1751
+ whole_conversation_metadata.append(f"Batch {batch_no}:")
1752
+
1753
+ try:
1754
+ if "AWS" in model_source:
1755
+ output_tokens = response.usage_metadata.get("outputTokens", 0)
1756
+ input_tokens = response.usage_metadata.get("inputTokens", 0)
1757
+
1758
+ elif "Gemini" in model_source:
1759
+ output_tokens = response.usage_metadata.candidates_token_count
1760
+ input_tokens = response.usage_metadata.prompt_token_count
1761
+
1762
+ elif "Azure/OpenAI" in model_source:
1763
+ input_tokens = response.usage_metadata.get("inputTokens", 0)
1764
+ output_tokens = response.usage_metadata.get("outputTokens", 0)
1765
+
1766
+ elif "Local" in model_source:
1767
+ if USE_LLAMA_CPP == "True":
1768
+ output_tokens = response["usage"].get("completion_tokens", 0)
1769
+ input_tokens = response["usage"].get("prompt_tokens", 0)
1770
+
1771
+ if USE_LLAMA_CPP == "False":
1772
+ input_tokens = num_transformer_input_tokens
1773
+ output_tokens = num_transformer_generated_tokens
1774
+
1775
+ elif "inference-server" in model_source:
1776
+ # inference-server returns the same format as llama-cpp
1777
+ output_tokens = response["usage"].get("completion_tokens", 0)
1778
+ input_tokens = response["usage"].get("prompt_tokens", 0)
1779
+
1780
+ else:
1781
+ input_tokens = 0
1782
+ output_tokens = 0
1783
+
1784
+ whole_conversation_metadata.append(
1785
+ "input_tokens: "
1786
+ + str(input_tokens)
1787
+ + " output_tokens: "
1788
+ + str(output_tokens)
1789
+ )
1790
+
1791
+ except KeyError as e:
1792
+ print(f"Key error: {e} - Check the structure of response.usage_metadata")
1793
+
1794
+ return (
1795
+ responses,
1796
+ conversation_history,
1797
+ whole_conversation,
1798
+ whole_conversation_metadata,
1799
+ response_text,
1800
+ )
1801
+
1802
+
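For orientation, a hedged sketch of a single-batch call to process_requests (argument values are illustrative, not from the repo):

responses, history, conversation, metadata, last_text = process_requests(
    prompts=["Create the topic table for this batch."],
    system_prompt=generic_system_prompt,
    conversation_history=[],
    whole_conversation=[],
    whole_conversation_metadata=[],
    client=client,                    # Gemini or Azure/OpenAI client
    config=client_config,
    model_choice="gemini-2.5-flash",  # illustrative model name
    temperature=0.0,
    bedrock_runtime=None,             # assumption: not needed for Gemini calls
    model_source="Gemini",
    batch_no=1,
)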
1803
+ def call_llm_with_markdown_table_checks(
1804
+ batch_prompts: List[str],
1805
+ system_prompt: str,
1806
+ conversation_history: List[dict],
1807
+ whole_conversation: List[str],
1808
+ whole_conversation_metadata: List[str],
1809
+ client: ai.Client | OpenAI,
1810
+ client_config: types.GenerateContentConfig,
1811
+ model_choice: str,
1812
+ temperature: float,
1813
+ reported_batch_no: int,
1814
+ local_model: object,
1815
+ tokenizer: object,
1816
+ bedrock_runtime: boto3.Session.client,
1817
+ model_source: str,
1818
+ MAX_OUTPUT_VALIDATION_ATTEMPTS: int,
1819
+ assistant_prefill: str = "",
1820
+ master: bool = False,
1821
+ CHOSEN_LOCAL_MODEL_TYPE: str = CHOSEN_LOCAL_MODEL_TYPE,
1822
+ random_seed: int = seed,
1823
+ api_url: str = None,
1824
+ ) -> Tuple[List[ResponseObject], List[dict], List[str], List[str], str]:
1825
+ """
1826
+ Call the large language model with checks for a valid markdown table.
1827
+
1828
+ Parameters:
1829
+ - batch_prompts (List[str]): A list of prompts to be processed.
1830
+ - system_prompt (str): The system prompt.
1831
+ - conversation_history (List[dict]): The history of the conversation.
1832
+ - whole_conversation (List[str]): The complete conversation including prompts and responses.
1833
+ - whole_conversation_metadata (List[str]): Metadata about the whole conversation.
1834
+ - client (ai.Client | OpenAI): The client object for running Gemini or Azure/OpenAI API calls.
1835
+ - client_config (types.GenerateContentConfig): Configuration for the model.
1836
+ - model_choice (str): The choice of model to use.
1837
+ - temperature (float): The temperature parameter for the model.
1838
+ - reported_batch_no (int): The reported batch number.
1839
+ - local_model (object): The local model to use.
1840
+ - tokenizer (object): The tokenizer to use.
1841
+ - bedrock_runtime (boto3.Session.client): The client object for boto3 Bedrock runtime.
1842
+ - model_source (str): The source of the model, whether in AWS, Gemini, local, or inference-server.
1843
+ - MAX_OUTPUT_VALIDATION_ATTEMPTS (int): The maximum number of attempts to validate the output.
1844
+ - assistant_prefill (str, optional): The text to prefill the LLM response. Currently only working with AWS Claude calls.
1845
+ - master (bool, optional): Boolean to determine whether this call is for the master output table.
1846
+ - CHOSEN_LOCAL_MODEL_TYPE (str, optional): String to determine model type loaded.
1847
+ - random_seed (int, optional): The random seed used for LLM generation.
1848
+ - api_url (str, optional): The API URL for inference-server calls. Required when model_source is 'inference-server'.
1849
+
1850
+ Returns:
1851
+ - Tuple[List[ResponseObject], List[dict], List[str], List[str], str]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, the updated whole conversation metadata, and the response text.
1852
+ """
1853
+
1854
+ call_temperature = temperature  # start from the requested temperature; increased on retries
1855
+
1856
+ # Update Gemini config with the new temperature settings
1857
+ client_config = types.GenerateContentConfig(
1858
+ temperature=call_temperature, max_output_tokens=max_tokens, seed=random_seed
1859
+ )
1860
+
1861
+ for attempt in range(MAX_OUTPUT_VALIDATION_ATTEMPTS):
1862
+ # Process requests to large language model
1863
+ (
1864
+ responses,
1865
+ conversation_history,
1866
+ whole_conversation,
1867
+ whole_conversation_metadata,
1868
+ response_text,
1869
+ ) = process_requests(
1870
+ batch_prompts,
1871
+ system_prompt,
1872
+ conversation_history,
1873
+ whole_conversation,
1874
+ whole_conversation_metadata,
1875
+ client,
1876
+ client_config,
1877
+ model_choice,
1878
+ call_temperature,
1879
+ bedrock_runtime,
1880
+ model_source,
1881
+ reported_batch_no,
1882
+ local_model,
1883
+ tokenizer=tokenizer,
1884
+ master=master,
1885
+ assistant_prefill=assistant_prefill,
1886
+ api_url=api_url,
1887
+ )
1888
+
1889
+ stripped_response = response_text.strip()
1890
+
1891
+ # Check if response meets our criteria (length and contains table) OR is "No change"
1892
+ if (
1893
+ len(stripped_response) > 120 and "|" in stripped_response
1894
+ ) or stripped_response.lower().startswith("no change"):
1895
+ if stripped_response.lower().startswith("no change"):
1896
+ print(f"Attempt {attempt + 1} produced 'No change' response.")
1897
+ else:
1898
+ print(f"Attempt {attempt + 1} produced response with markdown table.")
1899
+ break # Success - exit loop
1900
+
1901
+ # Increase temperature for next attempt
1902
+ call_temperature = temperature + (0.1 * (attempt + 1))
1903
+ print(
1904
+ f"Attempt {attempt + 1} resulted in invalid table: {stripped_response}. "
1905
+ f"Trying again with temperature: {call_temperature}"
1906
+ )
1907
+
1908
+ else: # This runs if no break occurred (all attempts failed)
1909
+ print(
1910
+ f"Failed to get valid response after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts"
1911
+ )
1912
+
1913
+ return (
1914
+ responses,
1915
+ conversation_history,
1916
+ whole_conversation,
1917
+ whole_conversation_metadata,
1918
+ stripped_response,
1919
+ )
1920
+
1921
+
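The acceptance test in the loop above can be read as a small predicate; a hedged restatement (helper name illustrative, not from the repo):

def looks_like_markdown_table(text: str) -> bool:
    # A response passes if it is long enough and contains a pipe character,
    # or if it explicitly reports "No change".
    s = text.strip()
    return (len(s) > 120 and "|" in s) or s.lower().startswith("no change")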
1922
+ def create_missing_references_df(
1923
+ basic_response_df: pd.DataFrame, existing_reference_df: pd.DataFrame
1924
+ ) -> pd.DataFrame:
1925
+ """
1926
+ Identifies references in basic_response_df that are not present in existing_reference_df.
1927
+ Returns a DataFrame with the missing references and the character count of their responses.
1928
+
1929
+ Args:
1930
+ basic_response_df (pd.DataFrame): DataFrame containing 'Reference' and 'Response' columns.
1931
+ existing_reference_df (pd.DataFrame): DataFrame containing 'Response References' column.
1932
+
1933
+ Returns:
1934
+ pd.DataFrame: A DataFrame with 'Missing Reference' and 'Response Character Count' columns.
1935
+ 'Response Character Count' will be 0 for empty strings and NaN for actual missing data.
1936
+ """
1937
+ # Ensure columns are treated as strings for robust comparison
1938
+ existing_references_unique = (
1939
+ existing_reference_df["Response References"].astype(str).unique()
1940
+ )
1941
+
1942
+ # Step 1: Identify all rows from basic_response_df that correspond to missing references
1943
+ # We want the entire row to access the 'Response' column later
1944
+ missing_data_rows = basic_response_df[
1945
+ ~basic_response_df["Reference"].astype(str).isin(existing_references_unique)
1946
+ ].copy() # .copy() to avoid SettingWithCopyWarning
1947
+
1948
+ # Step 2: Create the new DataFrame
1949
+ # Populate the 'Missing Reference' column directly
1950
+ missing_df = pd.DataFrame({"Missing Reference": missing_data_rows["Reference"]})
1951
+
1952
+ # Step 3: Calculate and add 'Response Character Count'
1953
+ # .str.len() works on Series of strings, handling empty strings (0) and NaN (NaN)
1954
+ missing_df["Response Character Count"] = missing_data_rows["Response"].str.len()
1955
+
1956
+ # Optional: Add the actual response text for easier debugging/inspection if needed
1957
+ # missing_df['Response Text'] = missing_data_rows['Response']
1958
+
1959
+ # Reset index to have a clean, sequential index for the new DataFrame
1960
+ missing_df = missing_df.reset_index(drop=True)
1961
+
1962
+ return missing_df
1963
+
1964
+
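A hedged usage sketch with toy data (values illustrative):

import pandas as pd

basic = pd.DataFrame({"Reference": ["1", "2", "3"],
                      "Response": ["Good service", "", "Too slow"]})
existing = pd.DataFrame({"Response References": ["1"]})

create_missing_references_df(basic, existing)
#   Missing Reference  Response Character Count
# 0                 2                         0
# 1                 3                         8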
1965
+ def calculate_tokens_from_metadata(
1966
+ metadata_string: str, model_choice: str, model_name_map: dict
1967
+ ):
1968
+ """
1969
+ Calculate the number of input and output tokens for given queries based on metadata strings.
1970
+
1971
+ Args:
1972
+ metadata_string (str): A string containing all relevant metadata from the string.
1973
+ model_choice (str): A string describing the model name
1974
+ model_name_map (dict): A dictionary mapping model name to source
1975
+ """
1976
+
1977
+ # The model source from model_name_map is not needed here; token totals are parsed directly from the metadata string.
1978
+
1979
+ # Regex to find the numbers following the keys in the "Query summary metadata" section
1980
+ # This ensures we get the final, aggregated totals for the whole query.
1981
+ input_regex = r"input_tokens: (\d+)"
1982
+ output_regex = r"output_tokens: (\d+)"
1983
+
1984
+ # re.findall returns a list of all matching strings (the captured groups).
1985
+ input_token_strings = re.findall(input_regex, metadata_string)
1986
+ output_token_strings = re.findall(output_regex, metadata_string)
1987
+
1988
+ # Convert the lists of strings to lists of integers and sum them up
1989
+ total_input_tokens = sum([int(token) for token in input_token_strings])
1990
+ total_output_tokens = sum([int(token) for token in output_token_strings])
1991
+
1992
+ number_of_calls = len(input_token_strings)
1993
+
1994
+ print(f"Found {number_of_calls} LLM call entries in metadata.")
1995
+ print("-" * 20)
1996
+ print(f"Total Input Tokens: {total_input_tokens}")
1997
+ print(f"Total Output Tokens: {total_output_tokens}")
1998
+
1999
+ return total_input_tokens, total_output_tokens, number_of_calls
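For example (hedged; the metadata string format follows the appends in process_requests above, and model_choice/model_name_map are assumed to be in scope):

meta = "Batch 1: input_tokens: 1200 output_tokens: 350 Batch 2: input_tokens: 900 output_tokens: 410"
calculate_tokens_from_metadata(meta, model_choice, model_name_map)
# -> (2100, 760, 2): two LLM calls, 2,100 input tokens, 760 output tokens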
tools/prompts.py ADDED
@@ -0,0 +1,260 @@
1
+ ###
2
+ # System prompt
3
+ ###
4
+
5
+ generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
6
+
7
+ system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. {consultation_context}"""
8
+
9
+ markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
10
+
11
+ ###
12
+ # Initial topic table prompt
13
+ ###
14
+ initial_table_system_prompt = system_prompt + markdown_additional_prompt
15
+
16
+ initial_table_assistant_prefill = "|"
17
+
18
+ default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column."
19
+
20
+ initial_table_prompt = """{validate_prompt_prefix}Your task is to create one new markdown table based on open text responses in the reponse table below.
21
+ In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.
22
+ In the second column named 'Subtopic', list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be empty.
23
+ {sentiment_choices}{response_reference_format}
24
+ In the final column named 'Summary', write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
25
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
26
+
27
+ Response table:
28
+ {response_table}
29
+
30
+ New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
31
+
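For orientation, a hedged sketch of how this template is presumably filled at call time (values illustrative; the real call sites live elsewhere in the repo, and response_table_markdown and the sentiment lead-in are assumptions):

prompt = initial_table_prompt.format(
    validate_prompt_prefix="",
    sentiment_choices="In the third column named 'Sentiment', " + default_sentiment_prompt + ".",
    response_reference_format=default_response_reference_format,
    add_existing_topics_summary_format="",
    response_table=response_table_markdown,  # assumed: markdown table of batch responses
    previous_table_introduction="",
    previous_table="",
    validate_prompt_suffix="",
)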
32
+ ###
33
+ # Adding existing topics to consultation responses
34
+ ###
35
+
36
+ add_existing_topics_system_prompt = system_prompt + markdown_additional_prompt
37
+
38
+ add_existing_topics_assistant_prefill = "|"
39
+
40
+ force_existing_topics_prompt = """Create a new markdown table. In the first column named 'Placeholder', write 'Not assessed'. In the second column named 'Subtopics', assign Topics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
41
+
42
+ allow_new_topics_prompt = """Create a new markdown table. In the first column named 'General topic', and the second column named 'Subtopic', assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General topic and Subtopic for the Topic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
43
+
44
+ force_single_topic_prompt = """ Assign each response to one single topic only."""
45
+
46
+ add_existing_topics_prompt = """{validate_prompt_prefix}Your task is to create one new markdown table, assigning responses from the Response table below to topics.
47
+ {topic_assignment}{force_single_topic}
48
+ {sentiment_choices}{response_reference_format}
49
+ In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
50
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
51
+
52
+ Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
53
+ {topics}
54
+
55
+ {response_table}
56
+
57
+ New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
58
+
59
+ ###
60
+ # VALIDATION PROMPTS
61
+ ###
62
+ # These are prompts used to validate previous LLM outputs, and create corrected versions of the outputs if errors are found.
63
+ validation_system_prompt = system_prompt
64
+
65
+ validation_prompt_prefix_default = """The following instructions were previously provided to create an output table:\n'"""
66
+
67
+ previous_table_introduction_default = (
68
+ """'\n\nThe following output table was created based on the above instructions:\n"""
69
+ )
70
+
71
+ validation_prompt_suffix_default = """\n\nBased on the above information, you need to create a corrected version of the output table. Examples of issues to correct include:
72
+
73
+ - Remove rows where responses are not relevant to the assigned topic, or where responses are not relevant to any topic.
74
+ - Remove rows where a topic is not assigned to any specific response.
75
+ - If the current topic assignment does not cover all information in a response, assign responses to relevant topics from the suggested topics table, or create a new topic if necessary.
76
+ - Correct any false information in the summary column, which is a summary of the relevant response text.
77
+ {additional_validation_issues}
78
+ - Any other obvious errors that you can identify.
79
+
80
+ With the above issues in mind, create a new, corrected version of the markdown table below. If there are no issues to correct, write simply "No change". Return only the corrected table without additional text, or 'no change' alone."""
81
+
82
+ validation_prompt_suffix_struct_summary_default = """\n\nBased on the above information, you need to create a corrected version of the output table. Examples of issues to correct include:
83
+
84
+ - Any misspellings in the Main heading or Subheading columns
85
+ - Correct any false information in the summary column, which is a summary of the relevant response text.
86
+ {additional_validation_issues}
87
+ - Any other obvious errors that you can identify.
88
+
89
+ With the above issues in mind, create a new, corrected version of the markdown table below. If there are no issues to correct, write simply "No change". Return only the corrected table without additional text, or 'no change' alone."""
90
+
91
+ ###
92
+ # SENTIMENT CHOICES
93
+ ###
94
+
95
+ negative_neutral_positive_sentiment_prompt = (
96
+ "write the sentiment of the Subtopic: Negative, Neutral, or Positive"
97
+ )
98
+ negative_or_positive_sentiment_prompt = (
99
+ "write the sentiment of the Subtopic: Negative or Positive"
100
+ )
101
+ do_not_assess_sentiment_prompt = "write the text 'Not assessed'" # Not used anymore. Instead, the column is filled in automatically with 'Not assessed'
102
+ default_sentiment_prompt = (
103
+ "write the sentiment of the Subtopic: Negative, Neutral, or Positive"
104
+ )
105
+
106
+ ###
107
+ # STRUCTURED SUMMARY PROMPT
108
+ ###
109
+
110
+ structured_summary_prompt = """Your task is to write a structured summary for open text responses.
111
+
112
+ Create a new markdown table based on the response table below with the headings 'Main heading', 'Subheading' and 'Summary'.
113
+
114
+ For each of the responses in the Response table, you will create a row for each summary associated with each of the Main headings and Subheadings from the Headings table. If there is no Headings table, create your own headings. In the first and second columns, write a Main heading and Subheading from the Headings table. Then in Summary, write a detailed and comprehensive summary that covers all information relevant to the Main heading and Subheading on the same row.
115
+ {summary_format}
116
+
117
+ Do not add any other columns. Do not add any other text to your response.
118
+
119
+ {response_table}
120
+
121
+ Headings to structure the summary are in the following table:
122
+ {topics}
123
+
124
+ New table:"""
125
+
126
+ ###
127
+ # SUMMARISE TOPICS PROMPT
128
+ ###
129
+
130
+ summary_assistant_prefill = ""
131
+
132
+ summarise_topic_descriptions_system_prompt = system_prompt
133
+
134
+ summarise_topic_descriptions_prompt = """Your task is to make a consolidated summary of the text below. {summary_format}
135
+
136
+ Return only the summary and no other text:
137
+
138
+ {summaries}
139
+
140
+ Summary:"""
141
+
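Likewise, a hedged example of filling the topic-summary template (the summaries text is illustrative):

prompt = summarise_topic_descriptions_prompt.format(
    summary_format=single_para_summary_format_prompt,
    summaries="- Residents report long waits for repairs.\n- Several responses mention rising costs.",
)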
142
+ single_para_summary_format_prompt = "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"
143
+
144
+ two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
145
+
146
+ ###
147
+ # OVERALL SUMMARY PROMPTS
148
+ ###
149
+
150
+ summarise_everything_system_prompt = system_prompt
151
+
152
+ summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
153
+
154
+ '{topic_summary_table}'
155
+
156
+ Your task is to summarise the above table. {summary_format}. Return only the summary and no other text.
157
+
158
+ Summary:"""
159
+
160
+ comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General topic' or 'Subtopic' directly in the summary. Format the output for Excel display using: **bold text** for main headings, β€’ bullet points for sub-items, and line breaks between sections. Avoid markdown symbols like # or ##."
161
+
162
+ comprehensive_summary_format_prompt_by_group = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General topic' or 'Subtopic' directly in the summary. Compare and contrast differences between the topics and themes from each Group. Format the output for Excel display using: **bold text** for main headings, β€’ bullet points for sub-items, and line breaks between sections. Avoid markdown symbols like # or ##."
163
+
164
+ # Alternative Excel formatting options
165
+ excel_rich_text_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General topic' or 'Subtopic' directly in the summary. Format for Excel using: BOLD for main headings, bullet points (β€’) for sub-items, and line breaks between sections. Use simple text formatting that Excel can interpret."
166
+
167
+ excel_plain_text_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General topic' or 'Subtopic' directly in the summary. Format as plain text with clear structure: use ALL CAPS for main headings, bullet points (β€’) for sub-items, and line breaks between sections. Avoid any special formatting symbols."
168
+
169
+ ###
170
+ # LLM-BASED TOPIC DEDUPLICATION PROMPTS
171
+ ###
172
+
173
+ llm_deduplication_system_prompt = """You are an expert at analysing and consolidating topic categories. Your task is to identify semantically similar topics that should be merged together, even if they use different wording or synonyms."""
174
+
175
+ llm_deduplication_prompt = """You are given a table of topics with their General topics, Subtopics, and Sentiment classifications. Your task is to identify topics that are semantically similar and should be merged together. Only merge topics that are almost identical in terms of meaning - if in doubt, do not merge.
176
+
177
+ Analyse the following topics table and identify groups of topics that describe essentially the same concept but may use different words or phrases. For example:
178
+ - "Transportation issues" and "Public transport problems"
179
+ - "Housing costs" and "Rent prices"
180
+ - "Environmental concerns" and "Green issues"
181
+
182
+ Create a markdown table with the following columns:
183
+ 1. 'Original General topic' - The current general topic name
184
+ 2. 'Original Subtopic' - The current subtopic name
185
+ 3. 'Original Sentiment' - The current sentiment
186
+ 4. 'Merged General topic' - The consolidated general topic name (use the most descriptive)
187
+ 5. 'Merged Subtopic' - The consolidated subtopic name (use the most descriptive)
188
+ 6. 'Merged Sentiment' - The consolidated sentiment (use 'Mixed' if sentiments differ)
189
+ 7. 'Merge Reason' - Brief explanation of why these topics should be merged
190
+
191
+ Only include rows where topics should actually be merged. If a topic has no semantic duplicates, do not include it in the table. Produce only a markdown table in the format described above. Do not add any other text to your response.
192
+
193
+ Topics to analyse:
194
+ {topics_table}
195
+
196
+ Merged topics table:"""
197
+
198
+ llm_deduplication_prompt_with_candidates = """You are given a table of topics with their General topics, Subtopics, and Sentiment classifications. Your task is to identify topics that are semantically similar and should be merged together, even if they use different wording.
199
+
200
+ Additionally, you have been provided with a list of candidate topics that represent preferred topic categories. When merging topics, prioritise fitting similar topics into these existing candidate categories rather than creating new ones. Only merge topics that are almost identical in terms of meaning - if in doubt, do not merge.
201
+
202
+ Analyse the following topics table and identify groups of topics that describe essentially the same concept but may use different words or phrases. For example:
203
+ - "Transportation issues" and "Public transport problems"
204
+ - "Housing costs" and "Rent prices"
205
+ - "Environmental concerns" and "Green issues"
206
+
207
+ When merging topics, consider the candidate topics provided below and try to map similar topics to these preferred categories when possible.
208
+
209
+ Create a markdown table with the following columns:
210
+ 1. 'Original General topic' - The current general topic name
211
+ 2. 'Original Subtopic' - The current subtopic name
212
+ 3. 'Original Sentiment' - The current sentiment
213
+ 4. 'Merged General topic' - The consolidated general topic name (prefer candidate topics when similar)
214
+ 5. 'Merged Subtopic' - The consolidated subtopic name (prefer candidate topics when similar)
215
+ 6. 'Merged Sentiment' - The consolidated sentiment (use 'Mixed' if sentiments differ)
216
+ 7. 'Merge Reason' - Brief explanation of why these topics should be merged
217
+
218
+ Only include rows where topics should actually be merged. If a topic has no semantic duplicates, do not include it in the table. Produce only a markdown table in the format described above. Do not add any other text to your response.
219
+
220
+ Topics to analyse:
221
+ {topics_table}
222
+
223
+ Candidate topics to consider for mapping:
224
+ {candidate_topics_table}
225
+
226
+ Merged topics table:"""
227
+
228
+ ###
229
+ # VERIFY EXISTING DESCRIPTIONS/TITLES - Currently not used
230
+ ###
231
+
232
+ verify_assistant_prefill = "|"
233
+
234
+ verify_titles_system_prompt = system_prompt
235
+
236
+ verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
237
+ {response_table}
238
+
239
+ The criteria for a suitable description for these responses are that it should be readable, concise, and fully encapsulate the main subject of the response.
240
+
241
+ Create a markdown table with four columns.
242
+ The first column is 'Response References', and should contain just the response number under consideration.
243
+ The second column is 'Is this a suitable description', answer the question with 'Yes' or 'No', with no other text.
244
+ The third column is 'Explanation', give a short explanation for your response in the second column.
245
+ The fourth column is 'Alternative description', suggest an alternative description for the response that meets the criteria stated above.
246
+ Do not add any other text to your response.
247
+
248
+ Output markdown table:"""
249
+
250
+
251
+ ## The following didn't work well in testing and so is not currently used
252
+
253
+ create_general_topics_system_prompt = system_prompt
254
+
255
+ create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
256
+ {topics}
257
+
258
+ Your task is to create a General topic name for each Subtopic. The new Topics table should have the columns 'General topic' and 'Subtopic' only. Write a 'General topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
259
+
260
+ New Topics table:"""
windows_install_llama-cpp-python.txt ADDED
@@ -0,0 +1,111 @@
1
+ ---
2
+
3
+ # How to build llama-cpp-python on Windows: Step-by-Step Guide
4
+
5
+ First, you need to set up a proper C++ development environment.
6
+
7
+ # Step 1: Install the C++ Compiler
8
+ On the Visual Studio downloads page, scroll down past the main programs to "Tools for Visual Studio" and download the "Build Tools for Visual Studio". This is a standalone installer that gives you the C++ compiler and libraries without installing the full Visual Studio IDE.
9
+
10
+ Run the installer. In the "Workloads" tab, check the box for "Desktop development with C++".
11
+
12
+ MSVC v143
13
+ C++ ATL
14
+ C++ Profiling tools
15
+ C++ CMake tools for Windows
16
+ C++ MFC
17
+ C++ Modules
18
+ Windows 10 SDK (10.0.20348.0)
19
+
20
+ Proceed with the installation.
21
+
22
+ You will need to use the 'x64 Native Tools Command Prompt for VS 2022' (run as administrator) to install the packages below.
23
+
24
+ # Step 2: Install CMake
25
+ Go to the CMake download page: https://cmake.org/download
26
+
27
+ Download the latest Windows installer (e.g., cmake-x.xx.x-windows-x86_64.msi).
28
+
29
+ Run the installer. Crucially, when prompted, select the option to "Add CMake to the system PATH for all users" or "for the current user." This allows you to run cmake from any command prompt.
30
+
31
+
32
+ # Step 3: (FOR CPU INFERENCE ONLY) Download and Place OpenBLAS
33
+ This is often the trickiest part.
34
+
35
+ Go to the OpenBLAS releases on GitHub.
36
+
37
+ Find a recent release and download the pre-compiled version for Windows. It will typically be a file named something like OpenBLAS-0.3.21-x64.zip (the version number will change). Make sure you get the 64-bit (x64) version if you are using 64-bit Python.
38
+
39
+ Create a folder somewhere easily accessible, for example, C:\libs\.
40
+
41
+ Extract the contents of the OpenBLAS zip file into that folder. Your final directory structure should look something like this:
42
+
43
+ C:\libs\OpenBLAS\
44
+ β”œβ”€β”€ bin\
45
+ β”œβ”€β”€ include\
46
+ └── lib\
47
+
48
+ ## 3.b. Install Chocolatey
49
+ https://chocolatey.org/install
50
+
51
+ Step 1: Install Chocolatey (if you don't already have it)
52
+ Open PowerShell as an Administrator. (Right-click the Start Menu -> "Windows PowerShell (Admin)" or "Terminal (Admin)").
53
+
54
+ Run the following command to install Chocolatey. It's a single, long line:
55
+
56
+ Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
57
+
58
+ Once it's done, close the Administrator PowerShell window.
59
+
60
+ Step 2: Install pkg-config-lite using Chocolatey
61
+ IMPORTANT: Open a NEW command prompt or PowerShell window (as a regular user is fine). This is necessary so it recognises the new choco command.
62
+
63
+ Run the following command in console to install a lightweight version of pkg-config:
64
+
65
+ choco install pkgconfiglite
66
+
67
+ Approve the installation by typing Y or A if prompted.
68
+
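To confirm pkg-config is now on your PATH, open a new console and run:

pkg-config --version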
69
+ # Step 4: Run the Installation Command
70
+ Now you have all the pieces. The final step is to run the command in a terminal that is aware of your new build environment.
71
+
72
+ Open the "Developer Command Prompt for VS" from your Start Menu. This is important! This special command prompt automatically configures all the necessary paths for the C++ compiler.
73
+
74
+ ## For CPU
75
+
76
+ set PKG_CONFIG_PATH=C:\<path-to-openblas>\OpenBLAS\lib\pkgconfig
+
+ (You can also set PKG_CONFIG_PATH permanently in your environment variables.)
77
+
78
+ pip install llama-cpp-python==0.3.16 --force-reinstall --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
79
+
80
+ Alternatively, passing PKG_CONFIG_PATH directly through the CMake arguments:
+
+ pip install llama-cpp-python==0.3.16 --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib;-DPKG_CONFIG_PATH=C:/<path-to-openblas>/OpenBLAS/lib/pkgconfig"
81
+
82
+ or to make a wheel:
83
+
84
+ pip wheel llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
85
+
86
+ pip wheel llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/Users/<user>/libs/OpenBLAS/include;-DBLAS_LIBRARIES=C:/Users/<user>/libs/OpenBLAS/lib/libopenblas.lib"
87
+
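To sanity-check a CPU build, try importing the package from a fresh console (a quick smoke test, not a full benchmark):

python -c "import llama_cpp; print(llama_cpp.__version__)"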
88
+
89
+
90
+ ## With Cuda (NVIDIA GPUs only)
91
+
92
+ Make sure that you have the CUDA 12.4 toolkit for Windows installed: https://developer.nvidia.com/cuda-12-4-0-download-archive
93
+
94
+ ### Make sure you are using the x64 version of Developer command tools for the below, e.g. 'x64 Native Tools Command Prompt for VS 2022' ###
95
+
96
+ Use NVIDIA GPU (cuBLAS): If you have an NVIDIA GPU, using cuBLAS is often easier because the CUDA Toolkit installer handles most of the setup.
97
+
98
+ Install the NVIDIA CUDA Toolkit.
99
+
100
+ Run the install command specifying cuBLAS (for faster inference):
101
+
102
+ pip install llama-cpp-python==0.3.16 --force-reinstall --verbose -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
103
+
104
+ If you want to create a new wheel to help with future installs, you can run:
105
+
106
+ First, cd to a folder that you have write access to, then run:
107
+
108
+ pip wheel llama-cpp-python==0.3.16 --wheel-dir dist --verbose -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
109
+
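To confirm a CUDA build actually offloads to the GPU, load any GGUF model with GPU layers enabled and watch the verbose log for CUDA/offload messages (the model path here is illustrative):

python -c "from llama_cpp import Llama; Llama(model_path='model.gguf', n_gpu_layers=-1, verbose=True)"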
110
+
111
+