키워드 검색 vs 시맨틱 검색

구분	키워드(BM25)	시맨틱(Vector)	하이브리드
검색 방식	단어 일치	의미 유사도	둘 다
"파이썬 배우기" → "Python 강좌"	❌ 불일치	✅ 유사	✅
정확한 키워드 검색	✅ 우수	❌ 약함	✅
오타 처리	❌ 약함	✅ 강함	✅
속도	매우 빠름	느림	중간

아키텍처

mermaid


flowchart TD
    Query[사용자 쿼리] --> Parallel{병렬 검색}
    Parallel --> BM25[BM25 키워드 검색<br/>Elasticsearch]
    Parallel --> Vec[벡터 시맨틱 검색<br/>pgvector]
    BM25 --> Merge[결과 병합<br/>RRF 알고리즘]
    Vec --> Merge
    Merge --> Rerank[Cohere Rerank<br/>재정렬]
    Rerank --> Results[최종 검색 결과]

pgvector + BM25 하이브리드 검색

python

import psycopg2
from openai import OpenAI
from rank_bm25 import BM25Okapi
import numpy as np

client = OpenAI()

class HybridSearchEngine:
    """Hybrid retriever mixing in-memory BM25 scores with pgvector similarity.

    Keyword scores come from a ``BM25Okapi`` index built over the documents
    passed to the most recent ``add_documents`` call. Vector scores come from
    the ``search_index`` table reached through ``conn``.

    Note that ``add_documents`` *replaces* the in-memory corpus while rows in
    ``search_index`` are only upserted, never deleted; ``search`` ranks only
    the documents currently held in memory.
    """

    def __init__(self, conn):
        """Store the DB-API connection; no index exists until documents are added.

        Args:
            conn: Open database connection (presumably psycopg2 against a
                database with the pgvector extension -- confirm with caller).
        """
        self.conn = conn
        self.documents = []
        self.bm25 = None

    def add_documents(self, docs: list[dict]):
        """Index ``docs`` for keyword search and upsert their embeddings.

        Args:
            docs: Dicts with at least ``"id"`` and ``"content"`` keys and an
                optional ``"metadata"`` dict.

        Raises:
            Exception: Whatever the embedding call or the database raises; the
                open transaction is rolled back before re-raising and the
                in-memory index is left as it was.
        """
        import json  # local import keeps this block self-contained

        if not docs:
            # An empty corpus cannot be BM25-indexed or embedded; just clear
            # the in-memory state (replace semantics) and leave the table alone.
            self.documents = []
            self.bm25 = None
            return

        # Whitespace tokenisation; built locally so a later failure does not
        # leave a half-updated engine.
        contents = [doc["content"] for doc in docs]
        bm25_index = BM25Okapi([text.split() for text in contents])

        response = client.embeddings.create(
            input=contents,
            model="text-embedding-3-small"
        )

        try:
            with self.conn.cursor() as cur:
                for doc, emb_data in zip(docs, response.data):
                    cur.execute(
                        '''INSERT INTO search_index (id, content, metadata, embedding)
                           VALUES (%s, %s, %s, %s)
                           ON CONFLICT (id) DO UPDATE
                           SET content=EXCLUDED.content,
                               metadata=EXCLUDED.metadata,
                               embedding=EXCLUDED.embedding''',
                        (
                            doc["id"],
                            doc["content"],
                            # psycopg2 cannot adapt a plain dict; send JSON text
                            # (assumes a json/jsonb or text column -- confirm).
                            json.dumps(doc.get("metadata", {})),
                            emb_data.embedding,
                        )
                    )
            self.conn.commit()
        except Exception:
            # Do not leave the connection stuck in a failed transaction.
            self.conn.rollback()
            raise

        # Publish the new corpus only once the database agrees with it.
        self.documents = docs
        self.bm25 = bm25_index

    def search(self, query: str, top_k: int = 10, bm25_weight: float = 0.3) -> list:
        """Rank the in-memory documents by a weighted BM25 + vector score.

        Args:
            query: Free-text query; split on whitespace for BM25.
            top_k: Number of results to return. The vector query fetches
                ``top_k * 2`` candidates; documents outside that candidate
                set get a vector score of 0.
            bm25_weight: Weight of the normalised BM25 score; the vector
                score gets ``1 - bm25_weight``.

        Returns:
            Up to ``top_k`` ``(document, combined_score)`` tuples, best first.
            Empty when no documents have been indexed yet.
        """
        if self.bm25 is None or not self.documents:
            return []

        # 1. BM25 scores, min-max normalised to [0, 1]; the epsilon guards the
        #    all-scores-equal case against division by zero.
        bm25_scores = self.bm25.get_scores(query.split())
        bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-8)

        # 2. Vector similarity: `<=>` is pgvector's cosine distance operator,
        #    so 1 - distance is the cosine similarity.
        query_vec = client.embeddings.create(
            input=[query], model="text-embedding-3-small"
        ).data[0].embedding

        with self.conn.cursor() as cur:
            cur.execute(
                '''SELECT id, 1 - (embedding <=> %s::vector) AS score
                   FROM search_index
                   ORDER BY embedding <=> %s::vector
                   LIMIT %s''',
                (query_vec, query_vec, top_k * 2)
            )
            vec_results = {row[0]: row[1] for row in cur.fetchall()}

        # 3. Score fusion (weighted sum); bm25_scores[i] lines up with
        #    self.documents[i] because both come from the same add_documents call.
        combined = []
        for i, doc in enumerate(self.documents):
            vec_score = vec_results.get(doc["id"], 0)
            combined_score = bm25_weight * bm25_scores[i] + (1 - bm25_weight) * vec_score
            combined.append((doc, combined_score))

        combined.sort(key=lambda x: x[1], reverse=True)
        return combined[:top_k]

RRF (Reciprocal Rank Fusion)

점수 스케일이 서로 다른 두 결과 목록을, 점수 대신 순위만 사용해 더 공정하게 합치는 알고리즘:

python

def reciprocal_rank_fusion(rankings: list[list], k: int = 60) -> list:
    """Fuse several ranked ID lists with Reciprocal Rank Fusion.

    Each appearance of a document at 0-based position ``rank`` contributes
    ``1 / (k + rank + 1)`` to that document's fused score.

    Args:
        rankings: One ranked list of document IDs per retrieval method,
            best match first.
        k: Smoothing constant (default 60).

    Returns:
        ``(doc_id, fused_score)`` tuples sorted by score, highest first; ties
        keep the order in which the IDs were first seen.
    """
    fused = {}
    for ranked_ids in rankings:
        for rank, doc_id in enumerate(ranked_ids):
            contribution = 1 / (k + rank + 1)
            if doc_id in fused:
                fused[doc_id] += contribution
            else:
                fused[doc_id] = contribution

    ordered = list(fused.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered

# Usage example
bm25_ranking = ["doc3", "doc1", "doc7", "doc2"]
vec_ranking = ["doc1", "doc5", "doc3", "doc8"]

fused = reciprocal_rank_fusion([bm25_ranking, vec_ranking])
# doc1 (1/62 + 1/61) edges out doc3 (1/61 + 1/63); doc5 appears in one list only.
print(fused[:3])  # [('doc1', 0.0325...), ('doc3', 0.0322...), ('doc5', 0.0161...)]

Typesense: 빠른 오픈소스 대안

python

import typesense

# Typesense client for a single local node; "your-api-key" is a placeholder.
# NOTE(review): this rebinds the name `client`, which the pgvector snippet
# uses for the OpenAI client -- rename one of them if both share a module.
client = typesense.Client({
    "api_key": "your-api-key",
    "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
    "connection_timeout_seconds": 2
})

# Create the collection schema
client.collections.create({
    "name": "articles",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "title", "type": "string"},
        {"name": "content", "type": "string"},
        {"name": "embedding", "type": "float[]", "num_dim": 1536}  # vector field
    ]
})

# Hybrid search (keyword + vector in one request)
# NOTE(review): `query_embedding` is not defined in this snippet; presumably a
# 1536-float embedding of the query text -- confirm against the caller.
# NOTE(review): Typesense's documentation describes hybrid weighting via an
# `alpha` value inside `vector_query`; confirm `hybrid_search_weight` is honoured.
results = client.collections["articles"].documents.search({
    "q": "파이썬 머신러닝",
    "query_by": "title,content",
    "vector_query": f"embedding:([{','.join(map(str, query_embedding))}], k:10)",
    "hybrid_search_weight": 0.7,  # vector-search weight
    "per_page": 10
})

Cohere Rerank으로 최종 정렬

python

import cohere

# Cohere client used by rerank_results; "your-api-key" is a placeholder value.
co = cohere.Client("your-api-key")

def rerank_results(query: str, documents: list[str], top_n: int = 5) -> list:
    """Rerank ``documents`` against ``query`` using the Cohere rerank model.

    Args:
        query: Search query text.
        documents: Candidate document texts.
        top_n: Passed through to the API as the number of results requested.

    Returns:
        ``(index, relevance_score)`` tuples in the order the API returned
        them; ``index`` presumably refers to a position in ``documents``.
    """
    response = co.rerank(
        query=query,
        documents=documents,
        top_n=top_n,
        model="rerank-multilingual-v3.0",
    )
    return [(hit.index, hit.relevance_score) for hit in response.results]

# Rerank after hybrid search
# NOTE(review): `hybrid_search` and `query` are not defined in this snippet;
# presumably hybrid_search returns (document_dict, score) tuples -- confirm.
candidates = hybrid_search(query, top_k=20)
doc_texts = [c["content"] for c, _ in candidates]
reranked = rerank_results(query, doc_texts, top_n=5)

# Map each reranked index back to its (document, score) candidate.
final_results = [candidates[idx] for idx, score in reranked]

검색 품질 평가

python

# Compute NDCG (Normalized Discounted Cumulative Gain)
def ndcg_at_k(relevance_scores: list, k: int) -> float:
    """Return NDCG@k for relevance grades given in ranked order.

    The ideal ordering is obtained by sorting the same ``relevance_scores``
    in descending order, so only the grades of the supplied results are
    considered.

    Args:
        relevance_scores: Relevance grade of each result, best-ranked first.
        k: Cut-off rank.

    Returns:
        DCG@k divided by the ideal DCG@k, or 0 when the ideal DCG is not
        positive (for example, no relevant results or an empty list).
    """

    def discounted_gain(grades):
        # The result at 1-based position p is discounted by log2(p + 1).
        return sum(
            grade / np.log2(position)
            for position, grade in enumerate(grades, start=2)
        )

    actual = discounted_gain(relevance_scores[:k])
    best_case = discounted_gain(sorted(relevance_scores, reverse=True)[:k])
    if best_case > 0:
        return actual / best_case
    return 0

# A/B comparison: keyword search vs. hybrid search on the same query
# NOTE(review): `keyword_search`, `hybrid_search`, `query` and `relevant_docs`
# are not defined in this snippet; both searches presumably return
# (document, score) tuples -- confirm.
results_a = keyword_search(query)
results_b = hybrid_search(query)

# Binary relevance: 1 when the returned document is in relevant_docs, else 0.
ndcg_a = ndcg_at_k([1 if doc in relevant_docs else 0 for doc, _ in results_a], k=5)
ndcg_b = ndcg_at_k([1 if doc in relevant_docs else 0 for doc, _ in results_b], k=5)

print(f"키워드 검색 NDCG@5: {ndcg_a:.4f}")
print(f"하이브리드 검색 NDCG@5: {ndcg_b:.4f}")

실제 서비스에서는 클릭 데이터, 체류 시간, 전환율을 오프라인 지표와 함께 측정하세요.

AI 기반 시맨틱 검색 엔진 구축하기: 하이브리드 검색 완전 가이드

키워드 검색 vs 시맨틱 검색

아키텍처

pgvector + BM25 하이브리드 검색

RRF (Reciprocal Rank Fusion)

Typesense: 빠른 오픈소스 대안

Cohere Rerank으로 최종 정렬

검색 품질 평가

이 글에서 다루는 AI

관련 글 더 보기

댓글

관련 모델

관련 방법론