311 lines
11 KiB
PL/PgSQL
311 lines
11 KiB
PL/PgSQL
-- Meeting Intelligence System - PostgreSQL Schema
|
|
-- Uses pgvector extension for semantic search
|
|
|
|
-- Enable required extensions
|
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
|
CREATE EXTENSION IF NOT EXISTS "vector";
|
|
|
|
-- ============================================================
|
|
-- Meetings Table
|
|
-- ============================================================
|
|
CREATE TABLE meetings (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
conference_id VARCHAR(255) NOT NULL,
|
|
conference_name VARCHAR(255),
|
|
title VARCHAR(500),
|
|
started_at TIMESTAMP WITH TIME ZONE,
|
|
ended_at TIMESTAMP WITH TIME ZONE,
|
|
duration_seconds INTEGER,
|
|
recording_path VARCHAR(1000),
|
|
audio_path VARCHAR(1000),
|
|
status VARCHAR(50) DEFAULT 'recording',
|
|
-- Status: 'recording', 'extracting_audio', 'transcribing', 'diarizing', 'summarizing', 'ready', 'failed'
|
|
error_message TEXT,
|
|
metadata JSONB DEFAULT '{}',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_meetings_conference_id ON meetings(conference_id);
|
|
CREATE INDEX idx_meetings_status ON meetings(status);
|
|
CREATE INDEX idx_meetings_started_at ON meetings(started_at DESC);
|
|
CREATE INDEX idx_meetings_created_at ON meetings(created_at DESC);
|
|
|
|
-- ============================================================
|
|
-- Meeting Participants
|
|
-- ============================================================
|
|
CREATE TABLE meeting_participants (
|
|
id SERIAL PRIMARY KEY,
|
|
meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE,
|
|
participant_id VARCHAR(255) NOT NULL,
|
|
display_name VARCHAR(255),
|
|
email VARCHAR(255),
|
|
joined_at TIMESTAMP WITH TIME ZONE,
|
|
left_at TIMESTAMP WITH TIME ZONE,
|
|
duration_seconds INTEGER,
|
|
is_moderator BOOLEAN DEFAULT FALSE,
|
|
metadata JSONB DEFAULT '{}',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_participants_meeting_id ON meeting_participants(meeting_id);
|
|
CREATE INDEX idx_participants_participant_id ON meeting_participants(participant_id);
|
|
|
|
-- ============================================================
|
|
-- Transcripts
|
|
-- ============================================================
|
|
CREATE TABLE transcripts (
|
|
id SERIAL PRIMARY KEY,
|
|
meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE,
|
|
segment_index INTEGER NOT NULL,
|
|
start_time FLOAT NOT NULL,
|
|
end_time FLOAT NOT NULL,
|
|
speaker_id VARCHAR(255),
|
|
speaker_name VARCHAR(255),
|
|
speaker_label VARCHAR(50), -- e.g., "Speaker 1", "Speaker 2"
|
|
text TEXT NOT NULL,
|
|
confidence FLOAT,
|
|
language VARCHAR(10) DEFAULT 'en',
|
|
word_timestamps JSONB, -- Array of {word, start, end, confidence}
|
|
metadata JSONB DEFAULT '{}',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_transcripts_meeting_id ON transcripts(meeting_id);
|
|
CREATE INDEX idx_transcripts_speaker_id ON transcripts(speaker_id);
|
|
CREATE INDEX idx_transcripts_start_time ON transcripts(meeting_id, start_time);
|
|
CREATE INDEX idx_transcripts_text_search ON transcripts USING gin(to_tsvector('english', text));
|
|
|
|
-- ============================================================
|
|
-- Transcript Embeddings (for semantic search)
|
|
-- ============================================================
|
|
CREATE TABLE transcript_embeddings (
|
|
id SERIAL PRIMARY KEY,
|
|
transcript_id INTEGER NOT NULL REFERENCES transcripts(id) ON DELETE CASCADE,
|
|
meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE,
|
|
embedding vector(384), -- all-MiniLM-L6-v2 dimensions
|
|
chunk_text TEXT, -- The text chunk this embedding represents
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_embeddings_transcript_id ON transcript_embeddings(transcript_id);
|
|
CREATE INDEX idx_embeddings_meeting_id ON transcript_embeddings(meeting_id);
|
|
CREATE INDEX idx_embeddings_vector ON transcript_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
|
|
|
-- ============================================================
|
|
-- AI Summaries
|
|
-- ============================================================
|
|
CREATE TABLE summaries (
|
|
id SERIAL PRIMARY KEY,
|
|
meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE,
|
|
summary_text TEXT,
|
|
key_points JSONB, -- Array of key point strings
|
|
action_items JSONB, -- Array of {task, assignee, due_date, completed}
|
|
decisions JSONB, -- Array of decision strings
|
|
topics JSONB, -- Array of {topic, duration_seconds, relevance_score}
|
|
sentiment VARCHAR(50), -- 'positive', 'neutral', 'negative', 'mixed'
|
|
sentiment_scores JSONB, -- {positive: 0.7, neutral: 0.2, negative: 0.1}
|
|
participants_summary JSONB, -- {participant_id: {speaking_time, word_count, topics}}
|
|
model_used VARCHAR(100),
|
|
prompt_tokens INTEGER,
|
|
completion_tokens INTEGER,
|
|
generated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
|
metadata JSONB DEFAULT '{}'
|
|
);
|
|
|
|
CREATE INDEX idx_summaries_meeting_id ON summaries(meeting_id);
|
|
CREATE INDEX idx_summaries_generated_at ON summaries(generated_at DESC);
|
|
|
|
-- ============================================================
|
|
-- Processing Jobs Queue
|
|
-- ============================================================
|
|
CREATE TABLE processing_jobs (
|
|
id SERIAL PRIMARY KEY,
|
|
meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE,
|
|
job_type VARCHAR(50) NOT NULL, -- 'extract_audio', 'transcribe', 'diarize', 'summarize', 'embed'
|
|
status VARCHAR(50) DEFAULT 'pending', -- 'pending', 'processing', 'completed', 'failed', 'cancelled'
|
|
priority INTEGER DEFAULT 5, -- 1 = highest, 10 = lowest
|
|
attempts INTEGER DEFAULT 0,
|
|
max_attempts INTEGER DEFAULT 3,
|
|
started_at TIMESTAMP WITH TIME ZONE,
|
|
completed_at TIMESTAMP WITH TIME ZONE,
|
|
error_message TEXT,
|
|
result JSONB,
|
|
worker_id VARCHAR(100),
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_jobs_meeting_id ON processing_jobs(meeting_id);
|
|
CREATE INDEX idx_jobs_status ON processing_jobs(status, priority, created_at);
|
|
CREATE INDEX idx_jobs_type_status ON processing_jobs(job_type, status);
|
|
|
|
-- ============================================================
|
|
-- Search History (for analytics)
|
|
-- ============================================================
|
|
CREATE TABLE search_history (
|
|
id SERIAL PRIMARY KEY,
|
|
user_id VARCHAR(255),
|
|
query TEXT NOT NULL,
|
|
search_type VARCHAR(50), -- 'text', 'semantic', 'combined'
|
|
results_count INTEGER,
|
|
meeting_ids UUID[],
|
|
filters JSONB,
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_search_history_created_at ON search_history(created_at DESC);
|
|
|
|
-- ============================================================
|
|
-- Webhook Events (for Jibri callbacks)
|
|
-- ============================================================
|
|
CREATE TABLE webhook_events (
|
|
id SERIAL PRIMARY KEY,
|
|
event_type VARCHAR(100) NOT NULL,
|
|
payload JSONB NOT NULL,
|
|
processed BOOLEAN DEFAULT FALSE,
|
|
processed_at TIMESTAMP WITH TIME ZONE,
|
|
error_message TEXT,
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_webhooks_processed ON webhook_events(processed, created_at);
|
|
|
|
-- ============================================================
|
|
-- Functions
|
|
-- ============================================================
|
|
|
|
-- Update timestamp trigger
|
|
CREATE OR REPLACE FUNCTION update_updated_at()
|
|
RETURNS TRIGGER AS $$
|
|
BEGIN
|
|
NEW.updated_at = NOW();
|
|
RETURN NEW;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
CREATE TRIGGER meetings_updated_at
|
|
BEFORE UPDATE ON meetings
|
|
FOR EACH ROW
|
|
EXECUTE FUNCTION update_updated_at();
|
|
|
|
CREATE TRIGGER jobs_updated_at
|
|
BEFORE UPDATE ON processing_jobs
|
|
FOR EACH ROW
|
|
EXECUTE FUNCTION update_updated_at();
|
|
|
|
-- Semantic search function
|
|
CREATE OR REPLACE FUNCTION semantic_search(
|
|
query_embedding vector(384),
|
|
match_threshold FLOAT DEFAULT 0.7,
|
|
match_count INT DEFAULT 10,
|
|
meeting_filter UUID DEFAULT NULL
|
|
)
|
|
RETURNS TABLE (
|
|
transcript_id INT,
|
|
meeting_id UUID,
|
|
chunk_text TEXT,
|
|
similarity FLOAT
|
|
) AS $$
|
|
BEGIN
|
|
RETURN QUERY
|
|
SELECT
|
|
te.transcript_id,
|
|
te.meeting_id,
|
|
te.chunk_text,
|
|
1 - (te.embedding <=> query_embedding) AS similarity
|
|
FROM transcript_embeddings te
|
|
WHERE
|
|
(meeting_filter IS NULL OR te.meeting_id = meeting_filter)
|
|
AND 1 - (te.embedding <=> query_embedding) > match_threshold
|
|
ORDER BY te.embedding <=> query_embedding
|
|
LIMIT match_count;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Full-text search function
|
|
CREATE OR REPLACE FUNCTION fulltext_search(
|
|
search_query TEXT,
|
|
meeting_filter UUID DEFAULT NULL,
|
|
match_count INT DEFAULT 50
|
|
)
|
|
RETURNS TABLE (
|
|
transcript_id INT,
|
|
meeting_id UUID,
|
|
text TEXT,
|
|
speaker_name VARCHAR,
|
|
start_time FLOAT,
|
|
rank FLOAT
|
|
) AS $$
|
|
BEGIN
|
|
RETURN QUERY
|
|
SELECT
|
|
t.id,
|
|
t.meeting_id,
|
|
t.text,
|
|
t.speaker_name,
|
|
t.start_time,
|
|
ts_rank(to_tsvector('english', t.text), plainto_tsquery('english', search_query)) AS rank
|
|
FROM transcripts t
|
|
WHERE
|
|
(meeting_filter IS NULL OR t.meeting_id = meeting_filter)
|
|
AND to_tsvector('english', t.text) @@ plainto_tsquery('english', search_query)
|
|
ORDER BY rank DESC
|
|
LIMIT match_count;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- ============================================================
|
|
-- Views
|
|
-- ============================================================
|
|
|
|
-- Meeting overview with stats
|
|
CREATE VIEW meeting_overview AS
|
|
SELECT
|
|
m.id,
|
|
m.conference_id,
|
|
m.conference_name,
|
|
m.title,
|
|
m.started_at,
|
|
m.ended_at,
|
|
m.duration_seconds,
|
|
m.status,
|
|
m.recording_path,
|
|
COUNT(DISTINCT mp.id) AS participant_count,
|
|
COUNT(DISTINCT t.id) AS transcript_segment_count,
|
|
COALESCE(SUM(LENGTH(t.text)), 0) AS total_characters,
|
|
s.id IS NOT NULL AS has_summary,
|
|
m.created_at
|
|
FROM meetings m
|
|
LEFT JOIN meeting_participants mp ON m.id = mp.meeting_id
|
|
LEFT JOIN transcripts t ON m.id = t.meeting_id
|
|
LEFT JOIN summaries s ON m.id = s.meeting_id
|
|
GROUP BY m.id, s.id;
|
|
|
|
-- Speaker stats per meeting
|
|
CREATE VIEW speaker_stats AS
|
|
SELECT
|
|
t.meeting_id,
|
|
t.speaker_id,
|
|
t.speaker_name,
|
|
t.speaker_label,
|
|
COUNT(*) AS segment_count,
|
|
SUM(t.end_time - t.start_time) AS speaking_time_seconds,
|
|
SUM(LENGTH(t.text)) AS character_count,
|
|
SUM(array_length(regexp_split_to_array(t.text, '\s+'), 1)) AS word_count
|
|
FROM transcripts t
|
|
GROUP BY t.meeting_id, t.speaker_id, t.speaker_name, t.speaker_label;
|
|
|
|
-- ============================================================
|
|
-- Sample Data (for testing - remove in production)
|
|
-- ============================================================
|
|
|
|
-- INSERT INTO meetings (conference_id, conference_name, title, started_at, status)
|
|
-- VALUES ('test-room-123', 'Test Room', 'Test Meeting', NOW() - INTERVAL '1 hour', 'ready');
|
|
|
|
COMMENT ON TABLE meetings IS 'Stores meeting metadata and processing status';
|
|
COMMENT ON TABLE transcripts IS 'Stores time-stamped transcript segments with speaker attribution';
|
|
COMMENT ON TABLE summaries IS 'Stores AI-generated meeting summaries and extracted information';
|
|
COMMENT ON TABLE transcript_embeddings IS 'Stores vector embeddings for semantic search';
|
|
COMMENT ON TABLE processing_jobs IS 'Job queue for async processing tasks';
|