# 1. 向量模型 (embedding models)
from dotenv import load_dotenv
import os
# 加载环境变量
load_dotenv('llm.env')
# 1. 字节跳动火山引擎
# https://www.volcengine.com/
from langchain_openai import OpenAIEmbeddings
def demo01():
    """Embed two Chinese sentences with the Volcengine (Doubao) embedding model."""
    # check_embedding_ctx_length must be False, otherwise langchain would
    # tokenize/numericalize the input text itself before calling the API.
    embedder = OpenAIEmbeddings(
        model='doubao-embedding-text-240715',
        check_embedding_ctx_length=False,
    )
    vectors = embedder.embed_documents(['你是谁?', '他又是谁?'])
    print(vectors)
# 2. 阿里云百炼大模型
# https://bailian.console.aliyun.com/
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
def demo02():
    """Embed documents via Alibaba Cloud Bailian's OpenAI-compatible endpoint.

    Reads BAILIAN_API_BASE / BAILIAN_API_KEY from the environment (raises
    KeyError when either is missing).
    """
    base_url = os.environ['BAILIAN_API_BASE']
    api_key = os.environ['BAILIAN_API_KEY']
    # check_embedding_ctx_length=False stops langchain from numericalizing
    # the text locally before sending it to the endpoint.
    embedder = OpenAIEmbeddings(
        model='text-embedding-v3',
        api_key=api_key,
        openai_api_base=base_url,
        check_embedding_ctx_length=False,
    )
    vectors = embedder.embed_documents(['你是谁?', '他又是谁?'])
    print(vectors)
# 3. 使用 jina 模型
# jinaai: https://jina.ai/
from langchain_community.embeddings import JinaEmbeddings
def demo03():
    """Embed documents with the hosted Jina embedding service (jina.ai)."""
    embedder = JinaEmbeddings(model_name='jina-embeddings-v3')
    vectors = embedder.embed_documents(['你是谁?', '他又是谁?'])
    print(vectors)
# 4. 使用 ollama 本地模型
from langchain_ollama import OllamaEmbeddings
def demo04():
    """Embed documents with a local Ollama model (nomic-embed-text)."""
    embedder = OllamaEmbeddings(model='nomic-embed-text:latest')
    vectors = embedder.embed_documents(['你是谁?', '他又是谁?'])
    print(vectors)
if __name__ == '__main__':
    # Run each embedding-provider demo in turn.
    for demo in (demo01, demo02, demo03, demo04):
        demo()
# 2. 向量操作 (vector-store operations)
from dotenv import load_dotenv
load_dotenv('llm.env')
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
# pip install faiss-cpu
import faiss
def demo01():
    """Build a FAISS index over two short documents, persist it, reload it, query it."""
    # check_embedding_ctx_length must be False, otherwise langchain would
    # numericalize the input text itself before calling the embedding API.
    vector_model = OpenAIEmbeddings(
        model='doubao-embedding-text-240515',
        check_embedding_ctx_length=False,
    )
    documents = [
        Document(page_content='你好,世界。'),
        Document(page_content='你好,AI。'),
    ]
    # Flat inner-product index. NOTE(review): 2048 is assumed to be the output
    # dimension of doubao-embedding-text-240515 — confirm against the model docs.
    vector_store = FAISS(
        embedding_function=vector_model,     # text -> vector model
        index=faiss.IndexFlatIP(2048),       # index type + vector dimension
        docstore=InMemoryDocstore(),         # documents kept in memory
        # distance_strategy=DistanceStrategy.COSINE,
        index_to_docstore_id={},
    )
    # Add documents with explicit string ids.
    vector_store.add_documents(documents, ids=['1', '2'])
    # Persist the index + docstore to disk, then load it back.
    vector_store.save_local('faiss.index')
    vector_store = FAISS.load_local(
        'faiss.index',
        embeddings=vector_model,
        allow_dangerous_deserialization=True,
    )
    # Delete by id:
    # vector_store.delete(['1'])
    # Query returning documents only.
    print(vector_store.similarity_search(query='hello, world', k=1))
    # Query returning (document, score) pairs.
    print(vector_store.similarity_search_with_score(query='hello, world', k=1))
    # Wrap the store as a retriever using maximal-marginal-relevance search.
    retriever = vector_store.as_retriever(search_type='mmr', search_kwargs={'k': 1})
    print(retriever.invoke('hello, world'))
from langchain_chroma import Chroma
def demo02():
    """Run the same add/update/query workflow against a local Chroma store.

    Compared with the FAISS demo above, Chroma:
      1. supports server/client mode (external Chroma server) as well as
         local embedded mode;
      2. supports updating documents (implemented as delete-then-add by id);
      3. needs no explicit vector dimension — it is inferred from the
         embedding model.
    """
    vector_model = OpenAIEmbeddings(
        model='doubao-embedding-text-240515',
        check_embedding_ctx_length=False,
    )
    vector_store = Chroma(
        collection_name='example_collection',
        embedding_function=vector_model,          # text -> vector model
        persist_directory='chroma_langchain_db',  # on-disk storage location
    )
    docs = [
        Document(page_content='你好,世界。'),
        Document(page_content='你好,AI。'),
    ]
    vector_store.add_documents(documents=docs, ids=['1', '2'])
    # Replace the content of document '1' in place.
    vector_store.update_documents(
        ids=['1'],
        documents=[Document(page_content='你好,人工智能。', metadata={'source': ''})],
    )
    print(vector_store.similarity_search('hello AI', k=2))
if __name__ == '__main__':
    # Exercise the FAISS demo first, then the Chroma demo.
    for demo in (demo01, demo02):
        demo()