leezhuuu committed on
Commit b38ace2 · verified · 1 Parent(s): 0ba9cfe

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +257 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,259 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))
+import pandas as pd
+import numpy as np
+import jieba
+import requests
+import time
+import os
+from openai import OpenAI
+from rank_bm25 import BM25Okapi
+from sklearn.metrics.pairwise import cosine_similarity
+
+# ================= 1. Security configuration and initialization =================
+
+# Try to read the key from an environment variable (standard practice for Docker / HF Spaces)
+API_KEY = os.getenv("SILICONFLOW_API_KEY")
+
+# Basic page settings
+st.set_page_config(
+    page_title="COMSOL Dark Expert",
+    page_icon="🌌",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+
+# Safety check: if no key is configured, stop execution so the public app does not leak error details
+if not API_KEY:
+    st.error("⚠️ No API key detected.")
+    st.info("In the Hugging Face Space, go to 'Settings' -> 'Variables and secrets' and add a Secret named `SILICONFLOW_API_KEY`.")
+    st.stop()
+
+# API configuration
+API_BASE = "https://api.siliconflow.cn/v1"
+EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-4B"
+RERANK_MODEL = "Qwen/Qwen3-Reranker-4B"
+GEN_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
+
+# Data source configuration
+DATA_URL = "https://share.leezhu.cn/graduation_design_data/comsol_embedded.parquet"
+LOCAL_DATA_PATH = "/app/comsol_embedded.parquet"  # Path inside the Docker container; alternatively just "comsol_embedded.parquet"
+
+# ================= 2. Resource loading (cached) =================
+
+@st.cache_resource
+def load_data_and_engine():
+    """Download the data and initialize the retrieval engine; runs only once globally."""
+
+    # 1. Download the data automatically
+    if not os.path.exists(LOCAL_DATA_PATH):
+        try:
+            print(f"Downloading data from {DATA_URL} ...")
+            headers = {'User-Agent': 'Mozilla/5.0'}  # Avoid being blocked by naive anti-scraping checks
+            r = requests.get(DATA_URL, headers=headers, stream=True)
+            r.raise_for_status()
+            with open(LOCAL_DATA_PATH, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            print("✅ Data download complete")
+        except Exception as e:
+            st.error(f"❌ Failed to download the data file: {str(e)}")
+            st.stop()
+
+    # 2. Initialize the engine
+    return FullRetriever(LOCAL_DATA_PATH)
+
+# ================= 3. Core backend classes =================
+
+class RerankClient:
+    def __init__(self, api_base, api_key, model):
+        self.api_url = f"{api_base}/rerank"
+        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+        self.model = model
+
+    def rerank(self, query: str, documents: list, top_n: int = 5):
+        if not documents: return []
+        payload = {"model": self.model, "query": query, "documents": documents, "top_n": top_n}
+        try:
+            response = requests.post(self.api_url, headers=self.headers, json=payload, timeout=30)
+            response.raise_for_status()
+            return response.json()['results']
+        except Exception as e:
+            print(f"Rerank Warning: {e}")
+            # Fallback: if the rerank API is unreachable, keep the original order
+            return [{"index": i, "relevance_score": 0.0} for i in range(len(documents))]
+
+class FullRetriever:
+    def __init__(self, parquet_path):
+        try:
+            self.df = pd.read_parquet(parquet_path)
+        except Exception as e:
+            raise RuntimeError(f"Failed to read Parquet file: {e}")
+
+        self.documents = self.df['content'].tolist()
+        # Make sure the embedding column becomes a numpy matrix
+        self.embeddings = np.stack(self.df['embedding'].values)
+        self.bm25 = BM25Okapi([jieba.lcut(str(d).lower()) for d in self.documents])
+        self.client = OpenAI(base_url=API_BASE, api_key=API_KEY)
+        self.reranker = RerankClient(API_BASE, API_KEY, RERANK_MODEL)
+
+    def _get_emb(self, q):
+        try:
+            resp = self.client.embeddings.create(model=EMBEDDING_MODEL, input=[q])
+            return resp.data[0].embedding
+        except Exception:
+            return [0.0] * 1024  # Keep the whole app from crashing if the API is down
+
+    def hybrid_search(self, query: str, top_k=5):
+        # 1. Vector retrieval
+        query_emb = self._get_emb(query)
+        vec_scores = cosine_similarity([query_emb], self.embeddings)[0]
+        vec_idx = np.argsort(vec_scores)[-100:][::-1]
+
+        # 2. Keyword retrieval
+        kw_idx = np.argsort(self.bm25.get_scores(jieba.lcut(query.lower())))[-100:][::-1]
+
+        # 3. RRF fusion
+        fused = {}
+        for r, i in enumerate(vec_idx): fused[i] = fused.get(i, 0) + 1/(60+r+1)
+        for r, i in enumerate(kw_idx): fused[i] = fused.get(i, 0) + 1/(60+r+1)
+
+        c_idxs = [x[0] for x in sorted(fused.items(), key=lambda x: x[1], reverse=True)[:50]]
+        c_docs = [self.documents[i] for i in c_idxs]
+
+        # 4. Reranking
+        results = self.reranker.rerank(query, c_docs, top_n=top_k)
+
+        final_res = []
+        context = ""
+        for i, item in enumerate(results):
+            orig_idx = c_idxs[item['index']]
+            row = self.df.iloc[orig_idx]
+            final_res.append({
+                "rank": i+1,
+                "score": item['relevance_score'],
+                "filename": row['filename'],
+                "content": row['content']
+            })
+            context += f"[Document {i+1}]: {row['content']}\n\n"
+        return final_res, context
+
+# ================= 4. UI rendering =================
+
+# Inject CSS styles
+st.markdown("""
+<style>
+    .stApp { background-color: #0E1117; color: #E0E0E0; }
+    .main-header {
+        background: linear-gradient(90deg, #0f2027 0%, #203a43 50%, #2c5364 100%);
+        padding: 1.5rem; border-radius: 0 0 15px 15px; color: #fff;
+        margin-bottom: 2rem; display: flex; align-items: center; justify-content: space-between;
+    }
+    .header-title { font-size: 1.8rem; font-weight: 700; color: white; margin:0;}
+    [data-testid="stChatMessage"] { background-color: #1E1E1E; border: 1px solid #333; }
+    .ref-card {
+        background-color: #161B22; border: 1px solid #30363D;
+        border-left: 4px solid #29B5E8; padding: 12px; margin-bottom: 12px;
+    }
+    .ref-title { font-weight: 600; color: #58A6FF; font-size: 0.95rem; }
+    .ref-snippet { font-size: 0.85rem; color: #8B949E; margin-top: 5px; font-family: monospace;}
+</style>
+""", unsafe_allow_html=True)
+
+def main():
+    # Top bar
+    st.markdown("""
+    <div class="main-header">
+        <div>
+            <div class="header-title">COMSOL Intelligent Simulation Expert</div>
+            <div style="color: #bbb; font-size: 0.8rem;">V3.0 Dark | Secured Docker Edition</div>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    # Load the engine (includes the download logic)
+    with st.spinner("🚀 Syncing data from the cloud and initializing the neural core..."):
+        retriever = load_data_and_engine()
+
+    # Sidebar
+    with st.sidebar:
+        st.header("🛠️ Parameter controls")
+        top_k = st.slider("Retrieval depth", 1, 10, 4)
+        temp = st.slider("Temperature", 0.0, 1.0, 0.3)
+        if st.button("🧹 Clear conversation"):
+            st.session_state.messages = []
+            st.session_state.current_refs = []
+            st.rerun()
+
+    # Session state initialization
+    if "messages" not in st.session_state: st.session_state.messages = []
+    if "current_refs" not in st.session_state: st.session_state.current_refs = []
+
+    # Layout
+    col_chat, col_evidence = st.columns([0.65, 0.35], gap="large")
+
+    with col_chat:
+        for msg in st.session_state.messages:
+            with st.chat_message(msg["role"]):
+                st.markdown(msg["content"])
+
+        if prompt := st.chat_input("Ask a COMSOL question..."):
+            st.session_state.messages.append({"role": "user", "content": prompt})
+            with st.chat_message("user"): st.markdown(prompt)
+
+            # Retrieval stage
+            with st.status("📡 Searching the knowledge base...", expanded=False):
+                refs, context = retriever.hybrid_search(prompt, top_k=top_k)
+                st.session_state.current_refs = refs
+
+            # Generation stage
+            system_prompt = f"""You are a COMSOL expert. Answer the question based on the reference documents below. If the documents contain no relevant information, say so explicitly.
+
+Reference documents:
+{context}
+"""
+
+            with st.chat_message("assistant"):
+                resp_cont = st.empty()
+                full_resp = ""
+
+                # Create a new client instance (uses the global API_KEY)
+                client = OpenAI(base_url=API_BASE, api_key=API_KEY)
+
+                try:
+                    stream = client.chat.completions.create(
+                        model=GEN_MODEL_NAME,
+                        messages=[
+                            {"role": "system", "content": system_prompt},
+                            *st.session_state.messages[-6:]  # carry recent history
+                        ],
+                        temperature=temp,
+                        stream=True
+                    )
+                    for chunk in stream:
+                        txt = chunk.choices[0].delta.content
+                        if txt:
+                            full_resp += txt
+                            resp_cont.markdown(full_resp + "▌")
+                    resp_cont.markdown(full_resp)
+                    st.session_state.messages.append({"role": "assistant", "content": full_resp})
+                    st.rerun()  # Force a rerun so the evidence panel on the right updates
+                except Exception as e:
+                    st.error(f"Generation interrupted: {e}")
+
+    with col_evidence:
+        st.caption("📚 Retrieved evidence")
+        if st.session_state.current_refs:
+            for ref in st.session_state.current_refs:
+                st.markdown(f"""
+                <div class="ref-card">
+                    <div class="ref-title">📄 {ref['filename']} (Score: {ref['score']:.2f})</div>
+                    <div class="ref-snippet">{ref['content'][:120]}...</div>
+                </div>
+                """, unsafe_allow_html=True)
+                with st.expander("Show full text"):
+                    st.text(ref['content'])
+        else:
+            st.info("No retrieval results yet")
+
+if __name__ == "__main__":
+    main()
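
The `hybrid_search` method added above fuses the vector ranking and the BM25 ranking with Reciprocal Rank Fusion, adding 1/(60 + rank + 1) for each list a document appears in before handing the top candidates to the reranker. A minimal standalone sketch of that fusion step, using invented toy rankings rather than real retriever output:

```python
# Minimal sketch of the RRF fusion used in hybrid_search (k=60, as in the diff).
# The ranked index lists below are toy data, not output of the real retrievers.
def rrf_fuse(vec_idx, kw_idx, k=60, top_n=3):
    fused = {}
    for rank, doc_id in enumerate(vec_idx):
        fused[doc_id] = fused.get(doc_id, 0) + 1 / (k + rank + 1)
    for rank, doc_id in enumerate(kw_idx):
        fused[doc_id] = fused.get(doc_id, 0) + 1 / (k + rank + 1)
    return sorted(fused.items(), key=lambda x: x[1], reverse=True)[:top_n]

print(rrf_fuse(vec_idx=[7, 2, 5], kw_idx=[2, 9, 7]))
# Docs 2 and 7 appear in both lists, so they outrank docs seen in only one list.
```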