This code uses LangChain and OpenAI's language model to process and query PDF documents. It loads a PDF file, splits it into pages, and stores those pages in a Chroma vector database. It then wraps the vector store in a toolkit and uses an agent executor to answer questions against the database from user input. This is useful for extracting and querying information from structured documents like financial reports.
import os
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)
Set the OpenAI API key for authentication
os.environ["OPENAI_API_KEY"] = ""  # paste your API key here, or load it from the environment
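Hardcoding the key in a notebook makes it easy to leak. A safer pattern, assuming the key lives in a local .env file and the python-dotenv package is installed, is to load it at runtime (a minimal sketch):

from dotenv import load_dotenv

# Reads OPENAI_API_KEY (and any other variables) from a local .env file into os.environ
load_dotenv()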
Create an instance of the OpenAI language model with specified parameters
llm = OpenAI(temperature=0.1, verbose=True)
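Note that langchain.llms.OpenAI wraps the legacy completions endpoint. On newer LangChain versions you would typically reach for the chat model wrapper instead; a sketch, where the model name is an assumption:

from langchain.chat_models import ChatOpenAI

# Chat-model wrapper; can generally be dropped in wherever the LLM is used below
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)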
Initialize a PDF loader for the specified file
loader = PyPDFLoader("apple.pdf")
Split the PDF into individual pages for processing
pages = loader.load_and_split()
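load_and_split breaks the PDF on page boundaries by default. If whole pages are too coarse for retrieval, you can pass your own splitter; a sketch assuming the standard RecursiveCharacterTextSplitter (chunk sizes are illustrative):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into overlapping ~1000-character chunks instead of whole pages
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
pages = loader.load_and_split(text_splitter=splitter)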
Load the split pages into a Chroma vector database for efficient querying
# An embedding function is required so Chroma can embed each page before storing it
embeddings = OpenAIEmbeddings()
store = Chroma.from_documents(pages, embeddings, collection_name="annualreport")
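Before wiring up the agent, it can be worth confirming the pages were actually indexed with a direct similarity search (a quick sketch; the query string is just an example):

# Return the 3 chunks most similar to the query, straight from the vector store
hits = store.similarity_search("total net sales", k=3)
for doc in hits:
    print(doc.metadata.get("page"), doc.page_content[:200])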
Create a VectorStoreInfo object to hold metadata about the vector store
vectorstore_info = VectorStoreInfo(
    name="apple",
    description="Apple quarterly consolidated financials",
    vectorstore=store,
)
Convert the vector store information into a toolkit for LangChain
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)
Create an agent executor that uses the language model and toolkit for querying
agent_executor = create_vectorstore_agent(llm=llm, toolkit=toolkit, verbose=True)
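If you don't need the agent's tool-selection layer, a plain retrieval QA chain over the same store answers questions more directly. A minimal sketch, assuming the standard RetrievalQA chain (the question is illustrative):

from langchain.chains import RetrievalQA

# Stuffs the retrieved pages into a single prompt and asks the LLM to answer from them
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=store.as_retriever())
print(qa_chain.run("What was Apple's total net sales for the quarter?"))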
Prompt the user to enter a search term for querying the document
prompt = input("Enter your search term: ")
if prompt:
    # Pass the user input to the agent executor for processing
    response = agent_executor.run(prompt)

    # Print the response from the language model to the screen
    print(response)
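To see which pages the answer is drawn from, you can also run the same query directly against the store with scores (a sketch; the score is a distance, so lower generally means closer):

if prompt:
    # Retrieve the pages most similar to the prompt together with their distance scores
    for doc, score in store.similarity_search_with_score(prompt):
        print(score, doc.page_content[:200])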
PyQuant News is where finance practitioners level up with Python for quant finance, algorithmic trading, and market data analysis. Looking to get started? Check out the fastest growing, top-selling course on Python for quant finance. For educational purposes. Not investment advice. Use at your own risk.
