Spaces:

Vardhan-kuppala
/

Microcontroller_selection_assistant

Sleeping

App Files Files Community

Microcontroller_selection_assistant / app.py

Vardhan-kuppala

Upload 3 files

abf8cf8 verified over 1 year ago

raw

history blame contribute delete

21.1 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from typing import List, Dict, Tuple, Optional
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.memory import ConversationBufferMemory
	from langchain_community.vectorstores import FAISS
	from langchain.docstore.document import Document
	from langchain_huggingface import HuggingFaceEndpoint
	from langchain.chains import ConversationalRetrievalChain
	from langchain.prompts import PromptTemplate
	import os

	# Configuration
	# Hugging Face Hub repository id of the hosted LLM; passed as `repo_id` in initialize_llm_chain().
	MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
	# Token read from the HF_TOKEN environment variable; os.getenv() yields None when it is unset.
	api_token = os.getenv("HF_TOKEN")

	# Define system message for consistent LLM behavior
	# Bound into QA_PROMPT's {system_message} placeholder via QA_PROMPT.partial() in initialize_llm_chain().
	SYSTEM_MESSAGE = """You are a microcontroller selection expert assistant. Your task is to:
	1. Analyze user requirements carefully
	2. Compare available microcontrollers based on ALL provided specifications
	3. Recommend the best matches with detailed explanations
	4. Consider trade-offs between different features
	5. Highlight any potential concerns or limitations

	When making recommendations:
	- Always mention specific model numbers and their key features
	- Explain why each recommendation matches the requirements
	- Compare pros and cons between recommendations
	- Note any missing specifications that might be important"""

	# Custom prompt templates
	# Handed to the chain as `condense_question_prompt`; template variables: {chat_history}, {question}.
	CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""
	Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question that captures all relevant context from the conversation.

	Chat History:
	{chat_history}

	Follow Up Input: {question}

	Standalone question:""")

	# Answer-generation template; variables: {system_message}, {context}, {question}.
	# Supplied to the chain through combine_docs_chain_kwargs after {system_message} is pre-filled.
	QA_PROMPT = PromptTemplate.from_template("""
	{system_message}

	Context information from microcontroller database:
	{context}

	User Query: {question}

	Provide a detailed response following these steps:
	1. Analyze Requirements: Clearly state the key requirements from the query
	2. Matching Products: List and compare the best matching microcontrollers
	3. Feature Analysis: Detail how each recommended product meets the requirements
	4. Trade-offs: Explain any compromises or trade-offs
	5. Additional Considerations: Mention any important factors the user should consider

	Response:""")

	def validate_excel_format(df: pd.DataFrame) -> bool:
	    """Report whether ``df`` carries the essential specification columns.

	    The four essential columns must all be present for the sheet to be
	    accepted. The remaining specification columns are optional; which of
	    them were found or missing is only printed for debugging.

	    Returns:
	        ``False`` when any essential column is absent, otherwise ``True``.
	    """
	    available = df.columns

	    # Gate on the essentials first so the optional report is skipped on failure.
	    missing_essential = [
	        name
	        for name in ('Product ID', 'Product Title', 'Bit Size', 'cpu')
	        if name not in available
	    ]
	    if missing_essential:
	        print(f"Missing essential columns: {missing_essential}")
	        return False

	    all_specs = (
	        'Product ID', 'Product Title', 'PLP', 'Bit Size', 'cpu',
	        'Program Memory (KB)', 'Data Flash (KB)', 'RAM (KB)',
	        'Lead Count (#)', 'Supply Voltage (V)', 'Operating Freq (Max) (MHz)',
	        'RTC', 'LVD or PVD', 'DMA', 'I/O Ports', 'Timer', 'ADC', 'DAC',
	        'Ethernet', 'USB', 'UART', 'SPI', 'I2C', 'CAN', 'LIN',
	        'Human machine interface', 'pkg.Type', 'Temp.Range',
	    )

	    # Single pass that sorts every expected column into "found" or "missing".
	    found_specs, missing_specs = [], []
	    for name in all_specs:
	        bucket = found_specs if name in available else missing_specs
	        bucket.append(name)

	    print("Found specifications:", found_specs)
	    print("Missing specifications:", missing_specs)
	    return True


	# Canonical column names that the cleaning and document-building steps look up.
	_CANONICAL_COLUMN_NAMES = (
	    'Product ID', 'Product Title', 'PLP', 'Bit Size', 'cpu',
	    'Program Memory (KB)', 'Data Flash (KB)', 'RAM (KB)',
	    'Lead Count (#)', 'Supply Voltage (V)', 'Operating Freq (Max) (MHz)',
	    'RTC', 'LVD or PVD', 'DMA', 'I/O Ports', 'Timer', 'ADC', 'DAC',
	    'Ethernet', 'USB', 'UART', 'SPI', 'I2C', 'CAN', 'LIN',
	    'Human machine interface', 'pkg.Type', 'Temp.Range',
	)

	# Hand-written aliases: lookup key (lowercase letters/digits only) -> canonical name.
	_EXPLICIT_COLUMN_ALIASES = {
	    'productid': 'Product ID',
	    'producttitle': 'Product Title',
	    'programmemorykb': 'Program Memory (KB)',
	    'programmemory': 'Program Memory (KB)',
	    'flashmemory': 'Program Memory (KB)',
	    'dataflashkb': 'Data Flash (KB)',
	    'dataflash': 'Data Flash (KB)',
	    'ramkb': 'RAM (KB)',
	    'ram': 'RAM (KB)',
	    'bitsize': 'Bit Size',
	    'cpucore': 'cpu',
	    'processor': 'cpu',
	    'supplyvoltage': 'Supply Voltage (V)',
	    'voltage': 'Supply Voltage (V)',
	    'operatingfreq': 'Operating Freq (Max) (MHz)',
	    'frequency': 'Operating Freq (Max) (MHz)',
	    'maxfreq': 'Operating Freq (Max) (MHz)',
	    'leadcount': 'Lead Count (#)',
	    'pins': 'Lead Count (#)',
	    'pincount': 'Lead Count (#)',
	    'interface': 'I/O Ports',
	    'ioports': 'I/O Ports',
	    'packagetype': 'pkg.Type',
	    'package': 'pkg.Type',
	    'temprange': 'Temp.Range',
	    'temperature': 'Temp.Range',
	    'humanmachineinterface': 'Human machine interface',
	    'hmi': 'Human machine interface',
	}


	def _column_lookup_key(col_name) -> str:
	    """Reduce a header to its lowercase letters and digits."""
	    return ''.join(ch for ch in str(col_name).lower() if ch.isalnum())


	# Every canonical name also matches itself regardless of case, spacing or
	# punctuation ('CPU' -> 'cpu', ' usb ' -> 'USB'). Explicit aliases win on a
	# key clash, so all previously mapped spellings resolve exactly as before.
	_COLUMN_NAME_ALIASES = {
	    **{_column_lookup_key(name): name for name in _CANONICAL_COLUMN_NAMES},
	    **_EXPLICIT_COLUMN_ALIASES,
	}


	def normalize_column_name(col_name: str) -> str:
	    """Map a spreadsheet header to its canonical column name.

	    The header is lowercased and stripped of everything except letters and
	    digits (all whitespace included, so headers with tabs or line breaks
	    still match), then looked up in the alias table.

	    Args:
	        col_name: Header as read from the spreadsheet. Non-string labels are
	            stringified for the lookup only.

	    Returns:
	        The canonical name when the header is a known alias or a
	        case/spacing/punctuation variant of a canonical name; otherwise
	        ``col_name`` itself, unchanged.
	    """
	    return _COLUMN_NAME_ALIASES.get(_column_lookup_key(col_name), col_name)

	def validate_and_map_columns(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
	    """Rename the columns of ``df`` to their standard names.

	    Each header is passed through ``normalize_column_name``. The frame is
	    renamed in place and the same object is returned.

	    Returns:
	        ``(df, column_mapping)`` where ``column_mapping`` maps each original
	        header to the name it was renamed to.
	    """
	    original_names = list(df.columns)
	    standard_names = [normalize_column_name(header) for header in original_names]
	    column_mapping = dict(zip(original_names, standard_names))

	    # Positional assignment: the i-th column receives the i-th standard name.
	    df.columns = standard_names

	    # Debug aid: show which specifications the sheet provides.
	    print("Found specifications:", standard_names)

	    return df, column_mapping


	def clean_excel_data(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a cleaned copy of the microcontroller table.

	    Three passes are applied:

	    1. Placeholder cells (NaN, 'N/A', 'NA', '-', 'None', 'none', 'nil',
	       'NIL') become empty strings.
	    2. Known numeric columns are reduced to the first unsigned number found
	       in each cell ('256 KB' -> 256.0); cells without a number become NaN.
	    3. Known feature columns are collapsed to 'Yes' / 'No'. A cell counts as
	       'Yes' when, after trimming and lowercasing, it is a recognised
	       positive marker or any finite number greater than zero, so channel
	       counts such as '3' or '1.0' are no longer reported as 'No'.

	    Args:
	        df: Table whose columns already use the canonical names. It is not
	            modified; ``DataFrame.replace`` yields a new frame.

	    Returns:
	        The cleaned frame.
	    """
	    # Replace various forms of empty/NA values
	    df = df.replace([np.nan, 'N/A', 'NA', '-', 'None', 'none', 'nil', 'NIL'], '')

	    numeric_cols = (
	        'Program Memory (KB)',
	        'Data Flash (KB)',
	        'RAM (KB)',
	        'Lead Count (#)',
	        'Supply Voltage (V)',
	        'Operating Freq (Max) (MHz)',
	    )
	    for col in numeric_cols:
	        if col in df.columns:
	            # expand=False returns a Series (not a one-column DataFrame), so
	            # the assignment back into a single column is unambiguous.
	            df[col] = (
	                df[col]
	                .astype(str)
	                .str.extract(r'(\d+\.?\d*)', expand=False)
	                .astype(float)
	            )

	    positive_markers = {'yes', 'y', '1', 'true', 'available', 'supported', '✓', '√'}

	    def to_flag(raw) -> str:
	        """Collapse one cell to 'Yes' or 'No'."""
	        text = str(raw).strip().lower()
	        if text in positive_markers:
	            return 'Yes'
	        try:
	            count = float(text)
	        except ValueError:
	            return 'No'
	        # Numeric cells are treated as a channel/instance count.
	        return 'Yes' if np.isfinite(count) and count > 0 else 'No'

	    feature_cols = ('RTC', 'DMA', 'Ethernet', 'USB', 'UART', 'SPI', 'I2C', 'CAN', 'LIN')
	    for col in feature_cols:
	        if col in df.columns:
	            df[col] = df[col].apply(to_flag)

	    return df

	def process_mc_excel(excel_file: str) -> Tuple[List[Document], Optional[str]]:
	    """Convert a microcontroller Excel sheet into one Document per row.

	    The sheet is read with pandas, its headers are standardised by
	    ``validate_and_map_columns`` and its cells cleaned by
	    ``clean_excel_data``. Each row is then rendered as titled sections
	    (core specs, memory, communication, ...) listing only the fields that
	    have a value, and paired with metadata for later display.

	    Args:
	        excel_file: Path of the Excel file to read.

	    Returns:
	        ``(documents, None)`` on success, or ``([], message)`` when required
	        columns are missing, the sheet has no rows, or any exception is
	        raised while processing (the traceback is printed).
	    """
	    # Unit suffix for numeric values, keyed by exact column name. An exact
	    # lookup avoids tagging unrelated columns whose name merely contains a
	    # 'V' (for example 'LVD or PVD') as volts.
	    unit_suffix = {
	        'Program Memory (KB)': ' KB',
	        'Data Flash (KB)': ' KB',
	        'RAM (KB)': ' KB',
	        'Operating Freq (Max) (MHz)': ' MHz',
	        'Supply Voltage (V)': 'V',
	    }

	    # Section title -> fields rendered under it, in output order.
	    feature_groups = (
	        ('Core Specifications', ['Product ID', 'Product Title', 'PLP', 'Bit Size', 'cpu']),
	        ('Memory', ['Program Memory (KB)', 'Data Flash (KB)', 'RAM (KB)']),
	        ('Communication Interfaces', ['Ethernet', 'USB', 'UART', 'SPI', 'I2C', 'CAN', 'LIN']),
	        ('Peripherals', ['Timer', 'ADC', 'DAC', 'RTC', 'DMA']),
	        ('Power and Performance', ['Supply Voltage (V)', 'Operating Freq (Max) (MHz)', 'LVD or PVD']),
	        ('Physical Specifications', ['Lead Count (#)', 'pkg.Type', 'Temp.Range']),
	        ('Interfaces', ['I/O Ports', 'Human machine interface']),
	    )

	    # Minimum columns a sheet must provide; a list keeps the error message
	    # in a stable order.
	    required_fields = ['Product ID', 'Product Title']

	    def has_value(value) -> bool:
	        """True when a cell is neither NaN nor blank."""
	        return bool(pd.notna(value)) and str(value).strip() != ''

	    def render(field, value):
	        """Format numbers compactly with their unit; pass other values through."""
	        if isinstance(value, (int, float)):
	            return f"{value:g}{unit_suffix.get(field, '')}"
	        return value

	    try:
	        print(f"Reading Excel file: {excel_file}")
	        df = pd.read_excel(excel_file)
	        print(f"Excel file loaded. Shape: {df.shape}")

	        # Validate and map columns
	        df, _ = validate_and_map_columns(df)
	        df = clean_excel_data(df)

	        missing_required = [field for field in required_fields if field not in df.columns]
	        if missing_required:
	            return [], f"Missing essential columns: {', '.join(missing_required)}"

	        documents = []
	        for idx, row in df.iterrows():
	            sections = []
	            for title, fields in feature_groups:
	                lines = [
	                    f"{field}: {render(field, row[field])}"
	                    for field in fields
	                    if field in df.columns and has_value(row.get(field))
	                ]
	                if lines:
	                    sections.append(f"{title}:\n" + "\n".join(lines))

	            metadata = {
	                "source": "excel",
	                "row": idx,
	                "product_id": str(row.get('Product ID', '')),
	                "product_title": str(row.get('Product Title', '')),
	            }

	            # Optional scalar metadata is added only when the cell has a value.
	            for meta_key, field in (("bit_size", "Bit Size"), ("cpu", "cpu")):
	                if field in df.columns and has_value(row.get(field)):
	                    metadata[meta_key] = str(row.get(field))

	            if 'Program Memory (KB)' in df.columns:
	                memory_kb = row.get('Program Memory (KB)')
	                if has_value(memory_kb):
	                    # Use the same compact number format as the page content
	                    # ('256 KB', not '256.0 KB').
	                    if isinstance(memory_kb, (int, float)):
	                        metadata["memory"] = f"{memory_kb:g} KB"
	                    else:
	                        metadata["memory"] = f"{memory_kb} KB"

	            # Always present; lists the interfaces flagged 'Yes' for this row.
	            metadata["interfaces"] = [
	                intf
	                for intf in ("USB", "Ethernet", "CAN", "SPI", "I2C")
	                if intf in df.columns and row.get(intf) == 'Yes'
	            ]

	            documents.append(
	                Document(page_content="\n\n".join(sections), metadata=metadata)
	            )

	        if not documents:
	            return [], "No valid microcontroller data found in Excel file."

	        print(f"Successfully processed {len(documents)} microcontrollers")
	        return documents, None

	    except Exception as e:
	        # Boundary handler: the UI shows the message instead of crashing.
	        import traceback
	        print("Excel processing error:")
	        print(traceback.format_exc())
	        return [], f"Error processing Excel file: {str(e)}"

	def create_vector_db(documents: List[Document]) -> Optional[FAISS]:
	    """Split the documents, embed the pieces and index them with FAISS.

	    Returns:
	        The FAISS store built from the split documents, or ``None`` when any
	        step raises (the error is printed).
	    """
	    splitter_settings = {
	        "chunk_size": 2048,  # Larger chunk size for complete spec retention
	        "chunk_overlap": 200,
	        "separators": ["\n\n", "\n", ". ", ", ", " "],
	    }

	    try:
	        splitter = RecursiveCharacterTextSplitter(**splitter_settings)
	        chunks = splitter.split_documents(documents)
	        embedder = HuggingFaceEmbeddings(
	            model_name="sentence-transformers/all-mpnet-base-v2"
	        )
	        store = FAISS.from_documents(chunks, embedder)
	    except Exception as e:
	        print(f"Error creating vector database: {str(e)}")
	        return None

	    return store

	def initialize_llm_chain(vector_db):
	    """Assemble the conversational retrieval chain on top of ``vector_db``.

	    The chain combines the hosted LLM endpoint, a conversation buffer
	    memory, an MMR retriever over ``vector_db`` and the module's two prompt
	    templates.

	    Returns:
	        The configured chain, or ``None`` when any step raises (the error is
	        printed).
	    """
	    generation_settings = {
	        "temperature": 0.3,
	        "max_new_tokens": 2048,
	        "top_k": 5,
	        "repetition_penalty": 1.1,
	    }
	    mmr_settings = {"k": 5, "fetch_k": 8, "lambda_mult": 0.7}

	    try:
	        llm = HuggingFaceEndpoint(
	            repo_id=MODEL_NAME,
	            huggingfacehub_api_token=api_token,
	            **generation_settings,
	        )
	        memory = ConversationBufferMemory(
	            memory_key="chat_history",
	            output_key='answer',
	            return_messages=True,
	        )
	        retriever = vector_db.as_retriever(
	            search_type="mmr",
	            search_kwargs=mmr_settings,
	        )

	        # Pre-fill the system message so the chain only supplies context and question.
	        answer_prompt = QA_PROMPT.partial(system_message=SYSTEM_MESSAGE)

	        return ConversationalRetrievalChain.from_llm(
	            llm=llm,
	            retriever=retriever,
	            memory=memory,
	            return_source_documents=True,
	            condense_question_prompt=CONDENSE_QUESTION_PROMPT,
	            combine_docs_chain_kwargs={'prompt': answer_prompt},
	        )
	    except Exception as e:
	        print(f"Error initializing LLM chain: {str(e)}")
	        return None

	def format_mc_response(source_doc: Document) -> str:
	    """Render a retrieved source document as display text.

	    Documents whose metadata marks them as coming from Excel get a short
	    product / CPU / memory header ahead of their content, with placeholder
	    text for missing metadata. Any other document, or any failure while
	    formatting, yields the raw page content.
	    """
	    try:
	        meta = source_doc.metadata
	        if meta.get('source') != 'excel':
	            return source_doc.page_content

	        header_lines = [
	            f"Product: {meta.get('product_title', 'N/A')}",
	            f"CPU: {meta.get('cpu', 'Not specified')}",
	            f"Memory: {meta.get('memory', 'Not specified')}",
	        ]
	        return "\n".join(header_lines) + f"\n\nSpecifications:\n{source_doc.page_content}"

	    except Exception as e:
	        # Best effort: fall back to the unformatted content.
	        print(f"Error formatting response: {str(e)}")
	        return source_doc.page_content

	def process_query(qa_chain, message: str, history: List) -> Tuple[str, List[str]]:
	"""Process user query with enhanced context handling"""
	try:
	# Add requirement analysis to user query
	enhanced_query = f"""Analyze the following microcontroller requirements and provide detailed recommendations:

	User Requirements: {message}

	Please consider:
	1. Core specifications and performance requirements
	2. Memory requirements and constraints
	3. Communication interfaces needed
	4. Peripheral requirements
	5. Power and operating conditions
	6. Physical and environmental constraints

	Provide a detailed comparison of the best matching microcontrollers."""

	response = qa_chain({
	"question": enhanced_query,
	"chat_history": [(hist[0], hist[1]) for hist in history]
	})

	sources = response["source_documents"][:3]
	source_contents = [format_mc_response(source) for source in sources]

	return response["answer"], source_contents

	except Exception as e:
	return f"Error processing query: {str(e)}", []


	def create_interface():
	    """Build the Gradio Blocks UI for the microcontroller assistant.

	    Layout, top to bottom: title and description, sample-file dropdown next
	    to an Excel upload, an "Initialize System" button with a status box, the
	    chat window, the query box, and the submit / clear buttons. The vector
	    store and the retrieval chain are kept in ``gr.State``.

	    Returns:
	        The assembled ``gr.Blocks`` app; the caller launches it.
	    """
	    with gr.Blocks(css="""
	#main-title {
	color: #00509e;
	font-family: 'Arial', sans-serif;
	text-align: center;
	margin-bottom: 20px;
	}
	#description {
	color: #333;
	font-family: 'Arial', sans-serif;
	text-align: center;
	margin-bottom: 30px;
	}
	#initialize-btn {
	background-color: #00509e;
	color: white;
	border: none;
	padding: 5px 15px;
	font-size: 14px;
	}
	#initialize-btn:hover {
	background-color: #003f7f;
	}
	.gradio-row {
	margin-bottom: 20px;
	}
	""") as demo:
	        # Title and description
	        gr.HTML("<h1 id='main-title'>Microcontroller Selection Assistant</h1>")
	        gr.HTML("<p id='description'>Select a sample file or upload your database. Then describe your requirements for tailored recommendations.</p>")

	        # File selection section (sample and upload)
	        with gr.Row(elem_id="file-section", equal_height=True):
	            with gr.Column(scale=1):
	                sample_file = gr.Dropdown(
	                    label="Sample Files",
	                    choices=["test_data.xlsx"],
	                    value="test_data.xlsx"
	                )
	            with gr.Column(scale=1):
	                excel_file = gr.File(
	                    label="Upload Microcontroller Database (Excel)",
	                    file_types=[".xlsx", ".xls"],
	                )

	        # Initialization button and status
	        with gr.Row(equal_height=True):
	            initialize_btn = gr.Button("Initialize System", elem_id="initialize-btn")
	            status = gr.Textbox(label="Status", value="Not initialized", interactive=False)

	        # Chat section
	        with gr.Row(equal_height=True):
	            chatbot = gr.Chatbot(label="Chat", height=400)

	        # Query input and buttons
	        with gr.Row(equal_height=True):
	            query = gr.Textbox(
	                placeholder="Describe your microcontroller requirements (e.g., '32-bit MCU with USB support and 256KB flash memory')",
	                label="Query",
	                lines=3
	            )

	        with gr.Row(equal_height=True):
	            submit_btn = gr.Button("Submit Query")
	            clear_btn = gr.Button("Clear Chat")

	        # State handlers
	        vector_db_state = gr.State()
	        qa_chain_state = gr.State()

	        def init_system(file, sample):
	            """Build the vector store and chain from the upload, else the sample.

	            Returns ``(vector_db, qa_chain, status_text)``; both objects are
	            ``None`` whenever a step fails.
	            """
	            if not file and not sample:
	                return None, None, "Please upload an Excel file or select a sample."

	            # An uploaded file takes precedence over the sample. The upload
	            # may arrive as an object exposing `.name` or as a plain path
	            # string (presumably depending on the Gradio version), so fall
	            # back to the value itself when there is no `.name`.
	            file_path = getattr(file, "name", file) if file else sample

	            docs, error = process_mc_excel(file_path)
	            if error:
	                return None, None, error

	            vector_db = create_vector_db(docs)
	            if not vector_db:
	                return None, None, "Failed to create vector database."

	            qa_chain = initialize_llm_chain(vector_db)
	            if not qa_chain:
	                return None, None, "Failed to initialize LLM chain."

	            return vector_db, qa_chain, "System initialized successfully!"

	        def handle_query(qa_chain, message, history):
	            """Answer one query; returns ``(updated_history, cleared_query_text)``."""
	            # The chat value can be empty/None before the first message.
	            history = history or []

	            if qa_chain is None:
	                return history + [("Error", "Please initialize the system first.")], ""

	            # Do not spend an LLM call on an empty submission.
	            if not message or not message.strip():
	                return history, ""

	            answer, sources = process_query(qa_chain, message, history)

	            # Include sources in the answer
	            if sources:
	                answer += "\n\nRelevant Products:\n" + "\n\n".join(sources)

	            return history + [(message, answer)], ""

	        def reset_chat(qa_chain):
	            """Clear the chat window, the query box and the chain's memory.

	            The chain built by ``initialize_llm_chain`` carries its own
	            conversation memory; without clearing it, earlier turns would
	            still shape follow-up questions after the window is emptied.
	            """
	            memory = getattr(qa_chain, "memory", None)
	            if memory is not None:
	                memory.clear()
	            return [], ""

	        # Button actions
	        initialize_btn.click(
	            init_system,
	            inputs=[excel_file, sample_file],
	            outputs=[vector_db_state, qa_chain_state, status]
	        )

	        submit_btn.click(
	            handle_query,
	            inputs=[qa_chain_state, query, chatbot],
	            outputs=[chatbot, query]
	        )

	        clear_btn.click(
	            reset_chat,
	            inputs=[qa_chain_state],
	            outputs=[chatbot, query]
	        )

	    return demo

	# Script entry point: build the UI and launch it with debug=True.
	if __name__ == "__main__":
	    demo = create_interface()
	    demo.launch(debug=True)