1+ # 步骤一:安装依赖库
2+ from langchain_community .document_loaders import WebBaseLoader
3+ from langchain .text_splitter import RecursiveCharacterTextSplitter
4+ from langchain_community .embeddings import DashScopeEmbeddings
5+ from pymilvus import MilvusClient , DataType , Function , FunctionType
6+
import os

# Connection and model settings for the whole script.
# Each credential can be supplied through an environment variable so that real
# secrets do not have to be written into this file; when a variable is unset,
# the placeholder (or default) below is used and must be replaced by hand.
dashscope_api_key = os.environ.get("DASHSCOPE_API_KEY", "<YOUR_DASHSCOPE_API_KEY>")
# Host name only: the "http://" scheme and the ":19530" port are added where
# the Milvus client is created.
milvus_url = os.environ.get("MILVUS_URL", "<YOUR_MILVUS_URL>")
user_name = os.environ.get("MILVUS_USER", "root")
password = os.environ.get("MILVUS_PASSWORD", "<YOUR_PASSWORD>")
collection_name = "milvus_overview"
# Dimension declared for the "dense" vector field of the collection.
# NOTE(review): this has to equal the length of the vectors returned by the
# embedding model -- confirm if the model is ever changed.
dense_dim = 1536
13+
# Step 2: data preparation
# Download the Milvus overview page that serves as the knowledge source.
loader = WebBaseLoader(
    [
        'https://raw.githubusercontent.com/milvus-io/milvus-docs/refs/heads/v2.5.x/site/en/about/overview.md',
    ]
)
docs = loader.load()

# Split the loaded documents into chunks of at most 1024 characters, with
# 256 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=256,
)
all_splits = text_splitter.split_documents(docs)

# DashScope embedding model used to produce the dense vectors.
embeddings = DashScopeEmbeddings(
    model="text-embedding-v2",
    dashscope_api_key=dashscope_api_key,
)

# Raw text of every chunk, in chunk order, and one embedding vector per chunk.
text_contents = [chunk.page_content for chunk in all_splits]
vectors = embeddings.embed_documents(text_contents)
33+
34+
# Connect to the Milvus instance.
# The URI and token must not contain any whitespace: the URI is
# "http://<host>:19530" and the token is "<user>:<password>".
client = MilvusClient(
    uri=f"http://{milvus_url}:19530",
    token=f"{user_name}:{password}",
)
39+
# Collection schema; dynamic fields are enabled.
schema = MilvusClient.create_schema(enable_dynamic_field=True)

# Analyzer configuration attached to the "text" field below.
analyzer_params = {"type": "english"}

# Field definitions, registered on the schema in exactly this order:
# an auto-generated INT64 primary key, the analyzed text with matching enabled,
# the sparse vector written by the BM25 function, and the dense embedding.
_field_specs = [
    dict(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True),
    dict(
        field_name="text",
        datatype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,
        analyzer_params=analyzer_params,
        enable_match=True,
    ),
    dict(field_name="sparse_bm25", datatype=DataType.SPARSE_FLOAT_VECTOR),
    dict(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
for _field_spec in _field_specs:
    schema.add_field(**_field_spec)

# BM25 function that takes the "text" field as input and writes its output
# to the "sparse_bm25" field.
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names="sparse_bm25",
)
schema.add_function(bm25_function)
61+
index_params = client.prepare_index_params()

# One index per vector field: IVF_FLAT with inner-product metric for the dense
# embeddings, SPARSE_WAND with the BM25 metric for the sparse field.
_index_specs = (
    {
        "field_name": "dense",
        "index_name": "dense_index",
        "index_type": "IVF_FLAT",
        "metric_type": "IP",
        "params": {"nlist": 128},
    },
    {
        "field_name": "sparse_bm25",
        "index_name": "sparse_bm25_index",
        "index_type": "SPARSE_WAND",
        "metric_type": "BM25",
    },
)
for _index_spec in _index_specs:
    index_params.add_index(**_index_spec)

# Create the collection from the schema and the index definitions above.
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)
86+
# Fail early with a clear message instead of an IndexError further down when
# the source page produced no chunks or the embedding call returned a
# different number of vectors than there are chunks.
if not vectors:
    raise ValueError("No embedding vectors were produced; nothing to insert.")
if len(vectors) != len(text_contents):
    raise ValueError(
        f"Got {len(vectors)} vectors for {len(text_contents)} text chunks."
    )

# One row per chunk: the dense embedding together with the chunk text.
# Only "dense" and "text" are supplied here; "id" is declared auto_id and
# "sparse_bm25" is the output field of the BM25 function.
data = [
    {"dense": vector, "text": text}
    for vector, text in zip(vectors, text_contents)
]

# Insert data
res = client.insert(
    collection_name=collection_name,
    data=data,
)

# Report how many vectors were generated and their dimension.
print(f"生成 {len(vectors)} 个向量,维度:{len(vectors[0])} ")
99+
100+ # 同样,在处理中文文档时,Milvus 2.5版本也支持指定相应的中文分析器。
101+ # # 定义分词器参数
102+ # analyzer_params = {
103+ # "type": "chinese" # 指定分词器类型为中文
104+ # }
105+ #
106+ # # 添加文本字段到 Schema,并启用分词器
107+ # schema.add_field(
108+ # field_name="text", # 字段名称
109+ # datatype=DataType.VARCHAR, # 数据类型:字符串(VARCHAR)
110+ # max_length=65535, # 最大长度:65535 字符
111+ # enable_analyzer=True, # 启用分词器
112+ # analyzer_params=analyzer_params # 分词器参数
113+ # )
114+
115+ # 步骤三:全文检索
116+ from pymilvus import MilvusClient
117+
# Create the Milvus client for the search step.
# The connection details come from the settings defined at the top of the
# script, so this step queries the same instance and collection that the data
# was inserted into instead of a second set of placeholders.
client = MilvusClient(
    uri=f"http://{milvus_url}:19530",  # Address of the Milvus instance.
    token=f"{user_name}:{password}",  # User name and password used to log in.
    db_name="default",  # Database to connect to; this example uses "default".
)

# Search parameters for the sparse (BM25) search.
# NOTE(review): drop_ratio_search is understood to be the share of
# low-weight query terms ignored during search -- confirm against the Milvus
# documentation.
search_params = {
    'params': {'drop_ratio_search': 0.2},
}

# Full-text search: the raw query string is passed as data and matched against
# the "sparse_bm25" field; the three best hits are returned with their text.
full_text_search_res = client.search(
    collection_name=collection_name,
    data=['what makes milvus so fast?'],
    anns_field='sparse_bm25',
    limit=3,
    search_params=search_params,
    output_fields=["text"],
)

# Print every hit, each followed by a separator line.
for hits in full_text_search_res:
    for hit in hits:
        print(hit)
        print("\n ")
142+
143+ # 步骤四:关键词匹配
144+ # filter = "TEXT_MATCH(text, 'query') and TEXT_MATCH(text, 'node')"
145+ #
146+ # text_match_res = client.search(
147+ # collection_name="milvus_overview",
148+ # anns_field="dense",
149+ # data=query_embeddings,
150+ # filter=filter,
151+ # search_params={"params": {"nprobe": 10}},
152+ # limit=2,
153+ # output_fields=["text"]
154+ # )
155+
156+ # 步骤五:混合检索与RAG
157+ from pymilvus import MilvusClient
158+ from pymilvus import AnnSearchRequest , RRFRanker
159+ from langchain_community .embeddings import DashScopeEmbeddings
160+ from dashscope import Generation
161+
# Create the Milvus client for the hybrid-search step.
# The connection details come from the settings defined at the top of the
# script, so this step queries the same instance the data was inserted into.
client = MilvusClient(
    uri=f"http://{milvus_url}:19530",  # Address of the Milvus instance.
    token=f"{user_name}:{password}",  # User name and password used to log in.
    db_name="default",  # Database to connect to; this example uses "default".
)

# collection_name and dashscope_api_key are intentionally not reassigned here:
# resetting them to placeholder literals would discard the values configured
# at the top of the script.

# Initialise the embedding model (text-embedding-v2).
embeddings = DashScopeEmbeddings(
    model="text-embedding-v2",
    dashscope_api_key=dashscope_api_key,
)

# Define the query
query = "Why does Milvus run so scalable?"

# Embed the query with embed_query, the Embeddings method intended for search
# queries (embed_documents is meant for the texts being indexed). The single
# vector is wrapped in a list because the search APIs take a batch of vectors.
query_embeddings = [embeddings.embed_query(query)]

# Number of results to retrieve for the query.
top_k = 5

# Dense vector search: inner-product metric, matching the "dense" index.
search_params_dense = {
    "metric_type": "IP",
    "params": {"nprobe": 2},
}
request_dense = AnnSearchRequest(
    query_embeddings, "dense", search_params_dense, limit=top_k
)

# BM25 text search: the raw query string is searched against "sparse_bm25".
search_params_bm25 = {
    "metric_type": "BM25",
}
request_bm25 = AnnSearchRequest(
    [query], "sparse_bm25", search_params_bm25, limit=top_k
)

# Combine the two requests
reqs = [request_dense, request_bm25]

# Rank the merged candidates with RRFRanker, constructed with the value 100.
ranker = RRFRanker(100)

# Perform the hybrid search
hybrid_search_res = client.hybrid_search(
    collection_name=collection_name,
    reqs=reqs,
    ranker=ranker,
    limit=top_k,
    output_fields=["text"],
)

# Collect the text of every hit as context for the answer generation and
# print each retrieved passage.
context = [
    hit['entity']['text']
    for hits in hybrid_search_res
    for hit in hits
]
print("Top K Results:")
for passage in context:
    print(passage)
228+
229+
from http import HTTPStatus


def getAnswer(query, context, api_key=None):
    """Ask the qwen-turbo model to answer *query* based on *context*.

    Args:
        query: The question to answer.
        context: The retrieved passages, either a single string or an
            iterable of strings. An iterable is joined with blank lines so
            that the prompt contains the passages themselves rather than the
            repr of a Python list (brackets, quotes, escaped newlines).
        api_key: Optional DashScope API key forwarded to ``Generation.call``.
            With the default ``None`` no key is supplied explicitly, which is
            equivalent to not passing the argument at all.

    Returns:
        The text of the model's answer.

    Raises:
        RuntimeError: If the response status is not HTTP 200. Without this
            check a failed call would surface as an AttributeError when
            reading the answer text.
    """
    context_text = context if isinstance(context, str) else "\n\n".join(context)
    prompt = (
        "Please answer my question based on the content within:\n"
        "```\n"
        f"{context_text}\n"
        "```\n"
        f"My question is: {query} .\n"
    )
    # Call the generation module to get an answer
    rsp = Generation.call(model='qwen-turbo', prompt=prompt, api_key=api_key)
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f"DashScope generation failed: status={rsp.status_code}, "
            f"code={rsp.code}, message={rsp.message}"
        )
    return rsp.output.text


# Get the answer. The API key configured for the embedding model is passed
# explicitly so that generation uses the same credentials.
answer = getAnswer(query, context, api_key=dashscope_api_key)

print(answer)
246+
247+
248+ # Expected output excerpt
249+ """
250+ Milvus is highly scalable due to its cloud-native and highly decoupled system architecture. This architecture allows the system to continuously expand as data grows. Additionally, Milvus supports three deployment modes that cover a wide...
251+ """
0 commit comments