Comments (1)
Maybe?
import os
import glob
import logging
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI as OpenAILLM
from langchain.chains.question_answering import load_qa_chain
def setup_logging():
    """Configure the root logger through ``logging.basicConfig``.

    Requests the INFO level and a "timestamp - level - message" record layout.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=record_layout, level=logging.INFO)
def custom_retry(max_retries=3, retry_exceptions=(Exception,), initial_delay=1, backoff_factor=2):
    """Build a decorator that retries a callable with exponential backoff.

    Args:
        max_retries: Maximum number of times the wrapped callable is invoked.
            Values below 1 are treated as 1 so the callable always runs at
            least once instead of being silently skipped.
        retry_exceptions: Tuple of exception types that trigger a retry. Any
            other exception propagates immediately.
        initial_delay: Seconds to sleep after the first failed attempt.
        backoff_factor: Multiplier applied to the delay after each sleep.

    Returns:
        A decorator. The decorated callable returns whatever the wrapped
        callable returns on its first successful attempt.

    Raises:
        The last caught exception, re-raised once all attempts are used up.
    """
    # Imported here because the file's top-level imports do not provide them.
    import functools
    import time
    from datetime import datetime, timedelta

    attempt_limit = max(1, max_retries)

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            attempts, delay = 0, initial_delay
            while True:
                try:
                    return func(*args, **kwargs)
                except retry_exceptions as e:
                    attempts += 1
                    if attempts >= attempt_limit:
                        # Out of attempts: do not announce a retry that will never happen.
                        logging.warning(
                            f"Retry attempt {attempts} for {func.__name__} due to {e}. No retries left."
                        )
                        raise
                    next_retry = datetime.now() + timedelta(seconds=delay)
                    logging.warning(
                        f"Retry attempt {attempts} for {func.__name__} due to {e}. Next retry at {next_retry}."
                    )
                    time.sleep(delay)
                    delay *= backoff_factor

        return wrapper

    return decorator
class PDFProcessor:
    """Load PDF files, split them into chunks, and hold the OpenAI helper objects.

    Construction reads ``OPENAI_API_KEY`` from the environment (after calling
    ``load_dotenv``) and builds the ``embeddings`` and ``llm`` attributes from it.
    """

    def __init__(self):
        """Read the API key, then create the embeddings and LLM objects."""
        self._load_env_vars()
        self._initialize_reusable_objects()

    @custom_retry(max_retries=3, retry_exceptions=(ValueError, FileNotFoundError))
    def _load_env_vars(self):
        """Call ``load_dotenv`` and store ``OPENAI_API_KEY`` on the instance.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        load_dotenv()
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY is missing. Please set the environment variable.")

    def _initialize_reusable_objects(self):
        """Create the embeddings and LLM objects once so they can be reused."""
        self.embeddings = OpenAIEmbeddings(openai_api_key=self.OPENAI_API_KEY)
        self.llm = OpenAILLM(temperature=0.25, openai_api_key=self.OPENAI_API_KEY)

    @staticmethod
    def get_user_query(prompt="Please enter your query: "):
        """Prompt on stdin and return the entered query without surrounding whitespace.

        Args:
            prompt: Text shown to the user.

        Raises:
            ValueError: If the input is empty or contains only whitespace.
        """
        query = input(prompt).strip()
        if not query:
            raise ValueError("Query should not be empty.")
        return query

    @custom_retry(max_retries=3, retry_exceptions=(FileNotFoundError,))
    def load_pdfs_from_directory(self, directory_path="data/"):
        """Load and split every ``*.pdf`` file directly inside ``directory_path``.

        Files are handed to a thread pool, one ``_load_and_split_document``
        call per file, and the per-file chunk lists are flattened into one list.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            A flat list of the chunks from all PDF files, in sorted file order.

        Raises:
            FileNotFoundError: If the directory does not exist or holds no PDF files.
        """
        if not os.path.isdir(directory_path):
            raise FileNotFoundError(f"The directory {directory_path} does not exist.")
        # os.path.join avoids the doubled separator that plain concatenation
        # produced for "data/"; sorting makes the chunk order deterministic.
        pdf_files = sorted(glob.glob(os.path.join(directory_path, "*.pdf")))
        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in the directory {directory_path}.")
        with ThreadPoolExecutor() as executor:
            all_texts = list(executor.map(self._load_and_split_document, pdf_files))
        return [chunk for chunks in all_texts for chunk in chunks]

    def _load_and_split_document(self, file_path, chunk_size=2000, chunk_overlap=0):
        """Load one PDF with ``PyPDFLoader`` and split it into chunks.

        Args:
            file_path: Path of the PDF file.
            chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

        Returns:
            The result of ``split_documents`` on the loaded data.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        loader = PyPDFLoader(file_path)
        data = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.split_documents(data)

    def perform_similarity_search(self, docsearch, query):
        """Return ``docsearch.similarity_search(query)`` after validating the query.

        Args:
            docsearch: Object exposing a ``similarity_search(query)`` method.
            query: Search text.

        Raises:
            ValueError: If ``query`` is empty or contains only whitespace.
        """
        if not query or (isinstance(query, str) and not query.strip()):
            raise ValueError("Query should not be empty.")
        return docsearch.similarity_search(query)
def main():
    """Index the PDFs, ask the user for a question, and log the chain's answer.

    Any exception is logged with its traceback instead of propagating.
    """
    try:
        setup_logging()
        pdf_processor = PDFProcessor()
        texts = pdf_processor.load_pdfs_from_directory()
        num_docs = len(texts)
        logging.info(f"Loaded {num_docs} document(s).")
        docsearch = Chroma.from_documents(texts, pdf_processor.embeddings)
        chain = load_qa_chain(pdf_processor.llm, chain_type="stuff")
        query = pdf_processor.get_user_query()
        result = pdf_processor.perform_similarity_search(docsearch, query)
        if not result:
            logging.warning("No matching documents were found for the query.")
            return
        # input_documents expects a list of documents. The original passed each
        # Document on its own, which is not a list, so the whole result list
        # is handed to the chain in one call.
        logging.info(chain.run(input_documents=result, question=query))
    except Exception as e:
        # logging.exception keeps the traceback that logging.error dropped.
        logging.exception(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
from build-ragai.
Related Issues (20)
- Double check both "prompt" documents for quality and clarity
- Double check "continued education" directory for useful learning
- flake8 results HOT 1
- Add descriptive Docstrings to all retained code HOT 1
- Jina embeddings + vector store module HOT 1
- Add notebook: privacy-rag-over-code HOT 1
- Add Notebook: rag-in-langchain
- Microphone Transcription Refactorization | App can't keep up, produces some inaccuracy | Upgrade to WhisperV3 + feat(consistent, predictable output production intervals)
- Create unittests | test the code in src/langchain/end2end/rag/pinecone
- Review and Rework src/langchain/notebooks code
- root dir README Changes: directly reference subdir src/langchain/end2end/
- analyze langchain/notebooks/** for outdated import statements, code, etc. HOT 4
- Create src/google directory for VertexAI related work HOT 1
- Lackluster Workflow HOT 1
- test streamlit module HOT 1
- README: Missing section for links to OpenAI and Transformers HOT 1
- Subdir: "codesnippets": unchecked HOT 1
- Ensure code works: "OpenAI" HOT 1
- Entire subdir: unchecked | requires meticulous review HOT 1
- Check my Gists for more useful notebooks
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization and data art.
-
Game
Something interesting about games, making everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from build-ragai.