174 lines
4.9 KiB
Python
174 lines
4.9 KiB
Python
"""
|
|
Search routes for Meeting Intelligence.
|
|
"""
|
|
|
|
from typing import Optional, List
|
|
|
|
from fastapi import APIRouter, HTTPException, Request, Query
|
|
from pydantic import BaseModel
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
from ..config import settings
|
|
|
|
import structlog
|
|
|
|
log = structlog.get_logger()
|
|
|
|
router = APIRouter()
|
|
|
|
# Lazy-load embedding model
|
|
_embedding_model = None
|
|
|
|
|
|
def get_embedding_model():
|
|
"""Get or initialize the embedding model."""
|
|
global _embedding_model
|
|
if _embedding_model is None:
|
|
log.info("Loading embedding model...", model=settings.embedding_model)
|
|
_embedding_model = SentenceTransformer(settings.embedding_model)
|
|
log.info("Embedding model loaded")
|
|
return _embedding_model
|
|
|
|
|
|
class SearchResult(BaseModel):
|
|
meeting_id: str
|
|
meeting_title: Optional[str]
|
|
text: str
|
|
start_time: Optional[float]
|
|
speaker_label: Optional[str]
|
|
score: float
|
|
search_type: str
|
|
|
|
|
|
class SearchResponse(BaseModel):
|
|
query: str
|
|
results: List[SearchResult]
|
|
total: int
|
|
search_type: str
|
|
|
|
|
|
class SearchRequest(BaseModel):
|
|
query: str
|
|
meeting_id: Optional[str] = None
|
|
search_type: str = "combined" # "text", "semantic", "combined"
|
|
limit: int = 20
|
|
|
|
|
|
@router.post("", response_model=SearchResponse)
|
|
async def search_transcripts(request: Request, body: SearchRequest):
|
|
"""Search across meeting transcripts.
|
|
|
|
Search types:
|
|
- text: Full-text search using PostgreSQL ts_vector
|
|
- semantic: Semantic search using vector embeddings
|
|
- combined: Both text and semantic search, merged results
|
|
"""
|
|
db = request.app.state.db
|
|
|
|
if not body.query or len(body.query.strip()) < 2:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="Query must be at least 2 characters"
|
|
)
|
|
|
|
results = []
|
|
|
|
# Full-text search
|
|
if body.search_type in ["text", "combined"]:
|
|
text_results = await db.fulltext_search(
|
|
query=body.query,
|
|
meeting_id=body.meeting_id,
|
|
limit=body.limit
|
|
)
|
|
|
|
for r in text_results:
|
|
results.append(SearchResult(
|
|
meeting_id=str(r["meeting_id"]),
|
|
meeting_title=r.get("meeting_title"),
|
|
text=r["text"],
|
|
start_time=r.get("start_time"),
|
|
speaker_label=r.get("speaker_label"),
|
|
score=float(r["rank"]),
|
|
search_type="text"
|
|
))
|
|
|
|
# Semantic search
|
|
if body.search_type in ["semantic", "combined"]:
|
|
try:
|
|
model = get_embedding_model()
|
|
query_embedding = model.encode(body.query).tolist()
|
|
|
|
semantic_results = await db.semantic_search(
|
|
embedding=query_embedding,
|
|
meeting_id=body.meeting_id,
|
|
threshold=0.6,
|
|
limit=body.limit
|
|
)
|
|
|
|
for r in semantic_results:
|
|
results.append(SearchResult(
|
|
meeting_id=str(r["meeting_id"]),
|
|
meeting_title=r.get("meeting_title"),
|
|
text=r["chunk_text"],
|
|
start_time=r.get("start_time"),
|
|
speaker_label=r.get("speaker_label"),
|
|
score=float(r["similarity"]),
|
|
search_type="semantic"
|
|
))
|
|
|
|
except Exception as e:
|
|
log.error("Semantic search failed", error=str(e))
|
|
if body.search_type == "semantic":
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Semantic search failed: {str(e)}"
|
|
)
|
|
|
|
# Deduplicate and sort by score
|
|
seen = set()
|
|
unique_results = []
|
|
for r in sorted(results, key=lambda x: x.score, reverse=True):
|
|
key = (r.meeting_id, r.text[:100])
|
|
if key not in seen:
|
|
seen.add(key)
|
|
unique_results.append(r)
|
|
|
|
return SearchResponse(
|
|
query=body.query,
|
|
results=unique_results[:body.limit],
|
|
total=len(unique_results),
|
|
search_type=body.search_type
|
|
)
|
|
|
|
|
|
@router.get("/suggest")
|
|
async def search_suggestions(
|
|
request: Request,
|
|
q: str = Query(..., min_length=2)
|
|
):
|
|
"""Get search suggestions based on partial query."""
|
|
db = request.app.state.db
|
|
|
|
# Simple prefix search on common terms
|
|
results = await db.fulltext_search(query=q, limit=5)
|
|
|
|
# Extract unique phrases
|
|
suggestions = []
|
|
for r in results:
|
|
# Get surrounding context
|
|
text = r["text"]
|
|
words = text.split()
|
|
|
|
# Find matching words and get context
|
|
for i, word in enumerate(words):
|
|
if q.lower() in word.lower():
|
|
start = max(0, i - 2)
|
|
end = min(len(words), i + 3)
|
|
phrase = " ".join(words[start:end])
|
|
if phrase not in suggestions:
|
|
suggestions.append(phrase)
|
|
if len(suggestions) >= 5:
|
|
break
|
|
|
|
return {"suggestions": suggestions}
|