MatrixOne Python SDK 适配示例
MatrixOne Python SDK 适配示例
MatrixOne Python SDK 提供了一种高效、便捷的方式,帮助开发者快速实现基于向量的语义搜索、全文搜索及混合检索等场景。通过 SDK,用户可以轻松地将文本数据转化为向量形式,存储至 MatrixOne 数据库,并进行高性能的数据检索和管理,从而实现智能化的信息检索与知识管理应用。
MatrixOne Python SDK 提供了与 MatrixOne 数据库交互的便捷方式,支持向量嵌入、全文搜索以及混合查询等功能。本 SDK 基于 mo_vector.client.MoVectorClient 类实现,能够高效地存储、检索和管理文本及其向量表示。
开始前准备
在你开始之前,确认你已经下载安装了如下软件:
- MatrixOne 单机部署:确保已完成 MatrixOne 的单机部署。
- Python 3.8 或更高版本:下载并安装 Python,然后通过以下命令验证安装:
python3 -V
- 依赖库安装:运行以下命令安装必要的 Python 库:
pip install sentence-transformers mo-vector
- 环境配置:由于国内网络无法直接访问 Hugging Face,在没有代理的情况下需要通过镜像站下载所需模型。请注意,下面的环境变量需要在命令行终端中设置,并在同一终端中运行脚本;不要直接在 PyCharm 等集成开发环境(IDE)中运行脚本,因为后者可能仍会尝试访问原始网站来获取模型。
export HF_ENDPOINT=https://hf-mirror.com
快速入门
以下示例演示了如何使用 MatrixOne Python SDK 进行向量嵌入、全文搜索和混合查询。
import os
from mo_vector.client import MoVectorClient
from sentence_transformers import SentenceTransformer
#from dotenv import load_dotenv
# Step 1. Initialise the embedding model
## Load a pretrained sentence-embedding model (here `msmarco-MiniLM-L12-cos-v5`).
print("Downloading and loading the embedding model...")
embed_model = SentenceTransformer(
    "sentence-transformers/msmarco-MiniLM-L12-cos-v5",
    trust_remote_code=True,
)
# Width of the vectors this model produces; passed to the client as the
# vector dimension of the table.
embed_model_dims = embed_model.get_sentence_embedding_dimension()
def text_to_embedding(text):
    """Convert text into a vector embedding.

    Args:
        text: The text handed to ``embed_model.encode``.

    Returns:
        The encoded embedding, converted with ``tolist()``.
    """
    embedding = embed_model.encode(text)
    return embedding.tolist()
# Step 2. Initialise MoVectorClient
## Connect to the MatrixOne database and create a table for vector storage.
# Connection settings are plain scalars. A trailing comma after a value would
# silently turn it into a 1-tuple and force `[0]` indexing everywhere.
username = "root"
password = "111"
host = "127.0.0.1"
port = 6001
database = "db1"
connection_string = f"mysql+pymysql://{username}:{password}@{host}:{port}/{database}"
vector_store = MoVectorClient(
    connection_string=connection_string,
    # Name of the table that stores the vector data.
    table_name="embedded_documents",
    # Dimension of the vectors generated by the embedding model.
    vector_dimension=embed_model_dims,
    # Whether to recreate the table when it already exists.
    drop_existing_table=True,
)
# Step 3. Bulk-insert documents and their vectors
## Each document carries an id, its text, the embedding of that text and metadata.
# Seed rows: (id, text, category). The embedding is computed from the text.
_seed_rows = [
    ("f8e7dee2-63b6-42f1-8b60-2d46710c1971", "dog", "animal"),
    ("f8e7dee2-63b6-42f1-8b60-2d46710c1972", "hot dog", "food"),
    ("8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", "fish", "animal"),
    ("e4991349-d00b-485c-a481-f61695f2b5ae", "tree", "plant"),
]
documents = [
    {
        "id": row_id,
        "text": row_text,
        "embedding": text_to_embedding(row_text),
        "metadata": {"category": row_category},
    }
    for row_id, row_text, row_category in _seed_rows
]

# The client takes parallel lists, one entry per document.
vector_store.insert(
    ids=[entry["id"] for entry in documents],
    texts=[entry["text"] for entry in documents],
    embeddings=[entry["embedding"] for entry in documents],
    metadatas=[entry["metadata"] for entry in documents],
)
# Step 4. Vector search
def print_result(query, result):
    """Print the hits of a vector search.

    Args:
        query: The query text, echoed in the header line.
        result: Iterable of hits; each hit must expose ``document`` and
            ``distance`` attributes.
    """
    print(f"Search result (\"{query}\"):")
    for r in result:
        print(f"- text: \"{r.document}\", distance: {r.distance}")
    # The separator is printed once per result set, after all hits.
    print("-----------------------------")
## Look up documents that are semantically similar to the query text.
query = "a swimming animal"
query_embedding = text_to_embedding(query)

## Basic query
print("vector query")
search_result = vector_store.query(query_embedding, k=3)
print_result(query, search_result)

## Query with a metadata filter
print("vector query with meta filter 'category=animal'")
search_result = vector_store.query(
    query_embedding,
    k=3,
    filter={"category": "\"animal\""},
)
print_result(query, search_result)

## Query bounded by a lower and an upper distance
print("vector query, distance in range [1.0, 1.2]")
search_result = vector_store.query(
    query_embedding,
    k=3,
    dis_lower_bound=1.0,
    dis_upper_bound=1.2,
)
print_result(query, search_result)
query1 = "a plant"
query_embedding1 = text_to_embedding(query1)
print(f"batch vector query")
## Batch query: several query embeddings are submitted in one call.
search_results = vector_store.batch_query([query_embedding, query_embedding1], k=3)
# Pair each result set with its query text by position.
for q, search_result in zip([query, query1], search_results):
    print_result(q, search_result)
# Step 5. Full-text search
def print_full_text_result(keywords, result):
    """Print the hits of a full-text search.

    Args:
        keywords: The search keywords, echoed in the header line.
        result: Iterable of hits; each hit must expose ``document`` and
            ``distance`` attributes. ``distance`` is printed under the
            label "score".
    """
    # NOTE: the header ends with an unpaired double quote. It is kept as-is so
    # the output matches the sample output shown in this document.
    print(f"Search result (keywords: {keywords}\"):")
    for r in result:
        print(f"- text: \"{r.document}\", score: {r.distance}")
    # The separator is printed once per result set, after all hits.
    print("-----------------------------")
## Create the full-text index, then run keyword searches.
vector_store.create_full_text_index()
keywords = ["dog"]

# Unfiltered keyword search.
print("full text query")
search_result = vector_store.full_text_query(keywords, k=3)
print_full_text_result(keywords, search_result)

# Filtered searches: (banner printed before the query, metadata filter).
_filtered_cases = [
    (
        "full text query with meta filter 'category=\"animal\"'",
        {"category": {"$eq": "\"animal\""}},
    ),
    (
        "full text query with meta filter 'category like %animal%'",
        {"category": {"$like": "%animal%"}},
    ),
    (
        "full text query with meta filter 'category like \"anim%'",
        {"category": {"$like": "\"anim%"}},
    ),
    (
        "full text query with meta filter 'category like \"anim'",
        {"category": {"$like": "\"anim"}},
    ),
]
for _banner, _meta_filter in _filtered_cases:
    print(_banner)
    search_result = vector_store.full_text_query(keywords, k=3, filter=_meta_filter)
    print_full_text_result(keywords, search_result)
# Step 6. Hybrid query (vector + full text)
def print_mix_result(query, keywords, result):
    """Print the hits of a hybrid (vector + full-text) search.

    Args:
        query: The query text, echoed in the header line.
        keywords: The search keywords, echoed in the header line.
        result: Iterable of indexable hits; ``r[0]`` is printed as the score
            and ``r[1]`` is printed under the label "text".
    """
    # NOTE: the quoting in the header is unbalanced. It is kept as-is so the
    # output matches the sample output shown in this document.
    print(f"Search result (query: \"{query}, keywords: {keywords}\"):")
    for r in result:
        # r[1] is formatted with its default string conversion.
        print(f"- text: \"{r[1]}\", score: {r[0]}")
    # The separator is printed once per result set, after all hits.
    print("-----------------------------")
## Rerank with the "RRF" option
print("mix query with rrf")
rerank_option_rrf = {
    "rerank_type": "RRF",
    "rank_value": 60,
}
search_result = vector_store.mix_query(
    query_embedding, keywords, rerank_option_rrf, k=3
)
print_mix_result(query, keywords, search_result)

## Rerank with the "WeightedRank" option
print("mix query using weighted")
rerank_option_weighted = {
    "rerank_type": "WeightedRank",
    "weighted_score": [0.8, 0.2],
    "rerank_score_threshold": 0,
}
search_result = vector_store.mix_query(
    query_embedding, keywords, rerank_option_weighted, k=4
)
print_mix_result(query, keywords, search_result)

print("mix query using weighted (bigger threshold)")
## Vector weight 80%, full-text weight 20%; only results scoring above 0.3 are returned.
rerank_option_weighted = {
    "rerank_type": "WeightedRank",
    "weighted_score": [0.8, 0.2],
    "rerank_score_threshold": 0.3,
}
search_result = vector_store.mix_query(
    query_embedding, keywords, rerank_option_weighted, k=4
)
print_mix_result(query, keywords, search_result)

## Weighted rerank combined with a metadata filter
print("mix query using weighted with filter")
rerank_option_weighted = {
    "rerank_type": "WeightedRank",
    "weighted_score": [0.8, 0.2],
    "rerank_score_threshold": 0,
}
search_result = vector_store.mix_query(
    query_embedding,
    keywords,
    rerank_option_weighted,
    k=4,
    filter={"category": "\"animal\""},
)
print_mix_result(query, keywords, search_result)
# Step 7. Delete the data
## Remove the documents by id, using the ids stored in `documents`.
_ids_to_delete = [entry["id"] for entry in documents]
vector_store.delete(ids=_ids_to_delete)
返回示例:
Downloading and loading the embedding model...
vector query
Search result ("a swimming animal"):
- text: "fish", distance: 0.9552924506129786
- text: "dog", distance: 1.13748271444737
- text: "hot dog", distance: 1.2123988019087684
-----------------------------
vector query with meta filter 'category=animal'
Search result ("a swimming animal"):
- text: "fish", distance: 0.9552924506129786
- text: "dog", distance: 1.13748271444737
-----------------------------
vector query, distance in range [1.0, 1.2]
Search result ("a swimming animal"):
- text: "dog", distance: 1.13748271444737
-----------------------------
batch vector query
Search result ("a swimming animal"):
- text: "fish", distance: 0.9552924506129786
- text: "dog", distance: 1.13748271444737
- text: "hot dog", distance: 1.2123988019087684
-----------------------------
Search result ("a plant"):
- text: "tree", distance: 1.0239213693424032
- text: "dog", distance: 1.1365387681557133
- text: "fish", distance: 1.1681793018342743
-----------------------------
full text query
Search result (keywords: ['dog']"):
- text: "dog", score: 0.09061906
- text: "hot dog", score: 0.09061906
-----------------------------
full text query with meta filter 'category="animal"'
Search result (keywords: ['dog']"):
- text: "dog", score: 0.09061906
-----------------------------
full text query with meta filter 'category like %animal%'
Search result (keywords: ['dog']"):
- text: "dog", score: 0.09061906
-----------------------------
full text query with meta filter 'category like "anim%'
Search result (keywords: ['dog']"):
- text: "dog", score: 0.09061906
-----------------------------
full text query with meta filter 'category like "anim'
Search result (keywords: ['dog']"):
-----------------------------
mix query with rrf
Search result (query: "a swimming animal, keywords: ['dog']"):
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1971', document='dog', metadata='{"category": "animal"}', distance=0.09061906)", score: 0.03252247488101534
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1972', document='hot dog', metadata='{"category": "food"}', distance=0.09061906)", score: 0.03200204813108039
- text: "QueryResult(id='8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6', document='fish', metadata={'category': 'animal'}, distance=0.9552924506129786)", score: 0.01639344262295082
-----------------------------
mix query using weighted
Search result (query: "a swimming animal, keywords: ['dog']"):
- text: "QueryResult(id='8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6', document='fish', metadata={'category': 'animal'}, distance=0.9552924506129786)", score: 0.8
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1971', document='dog', metadata='{"category": "animal"}', distance=0.09061906)", score: 0.6000000000000001
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1972', document='hot dog', metadata='{"category": "food"}', distance=0.09061906)", score: 0.3361337882406932
- text: "QueryResult(id='e4991349-d00b-485c-a481-f61695f2b5ae', document='tree', metadata={'category': 'plant'}, distance=1.2637605026033762)", score: 0.1638662117593067
-----------------------------
mix query using weighted (bigger threshold)
Search result (query: "a swimming animal, keywords: ['dog']"):
- text: "QueryResult(id='8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6', document='fish', metadata={'category': 'animal'}, distance=0.9552924506129786)", score: 0.8
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1971', document='dog', metadata='{"category": "animal"}', distance=0.09061906)", score: 0.6000000000000001
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1972', document='hot dog', metadata='{"category": "food"}', distance=0.09061906)", score: 0.3361337882406932
-----------------------------
mix query using weighted with filter
Search result (query: "a swimming animal, keywords: ['dog']"):
- text: "QueryResult(id='8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6', document='fish', metadata={'category': 'animal'}, distance=0.9552924506129786)", score: 0.8
- text: "QueryResult(id='f8e7dee2-63b6-42f1-8b60-2d46710c1971', document='dog', metadata='{"category": "animal"}', distance=0.09061906)", score: 0.6000000000000001
-----------------------------