Giter Club home page Giter Club logo

Comments (1)

Daethyra avatar Daethyra commented on June 11, 2024

Maybe?

import functools
import glob
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI as OpenAILLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

def setup_logging():
    """Configure the root logger for INFO-level output.

    Each record is rendered as ``<timestamp> - <level name> - <message>``.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=record_layout, level=logging.INFO)

def custom_retry(max_retries=3, retry_exceptions=(Exception,), initial_delay=1, backoff_factor=2):
    """Build a decorator that retries a callable with exponential backoff.

    Args:
        max_retries: Total number of attempts, including the first call.
            NOTE: with a value below 1 the wrapped function is never called
            and the wrapper returns ``None``.
        retry_exceptions: Tuple of exception types that trigger a retry. Any
            other exception propagates immediately.
        initial_delay: Seconds to sleep before the second attempt.
        backoff_factor: Multiplier applied to the delay after every failed
            attempt.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns on its first successful attempt.

    Raises:
        The last caught exception, once ``max_retries`` attempts have failed.
    """
    def decorator(func):
        # Preserve the wrapped function's __name__/__doc__ for logs and debugging.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            attempts, delay = 0, initial_delay
            while attempts < max_retries:
                try:
                    return func(*args, **kwargs)
                except retry_exceptions as e:
                    attempts += 1
                    if attempts == max_retries:
                        # Out of attempts: do not announce a retry that will
                        # never happen; re-raise the original exception.
                        logging.warning(
                            f"Attempt {attempts} for {func.__name__} failed due to {e}. No retries left."
                        )
                        raise
                    next_retry = datetime.now() + timedelta(seconds=delay)
                    logging.warning(f"Retry attempt {attempts} for {func.__name__} due to {e}. Next retry at {next_retry}.")
                    time.sleep(delay)
                    delay *= backoff_factor
        return wrapper
    return decorator

class PDFProcessor:
    """Load PDF files, split them into chunks, and query them by similarity.

    On construction the instance reads ``OPENAI_API_KEY`` from the
    environment (after loading a ``.env`` file) and builds the embeddings and
    LLM objects that are reused by callers.
    """

    def __init__(self):
        """Load the API key and create the reusable OpenAI objects."""
        self._load_env_vars()
        self._initialize_reusable_objects()

    # NOTE(review): a missing environment variable will not appear between
    # attempts, so retrying here mostly delays the failure -- confirm intent.
    @custom_retry(max_retries=3, retry_exceptions=(ValueError, FileNotFoundError))
    def _load_env_vars(self):
        """Load ``.env`` and store ``OPENAI_API_KEY`` on the instance.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        load_dotenv()
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not self.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY is missing. Please set the environment variable.")

    def _initialize_reusable_objects(self):
        """Create the embeddings and LLM objects from the stored API key."""
        self.embeddings = OpenAIEmbeddings(openai_api_key=self.OPENAI_API_KEY)
        self.llm = OpenAILLM(temperature=0.25, openai_api_key=self.OPENAI_API_KEY)

    @staticmethod
    def get_user_query(prompt="Please enter your query: "):
        """Prompt on stdin and return the entered query.

        Args:
            prompt: Text shown to the user.

        Raises:
            ValueError: If the user enters an empty string.
        """
        query = input(prompt)
        if not query:
            raise ValueError("Query should not be empty.")
        return query

    @custom_retry(max_retries=3, retry_exceptions=(FileNotFoundError,))
    def load_pdfs_from_directory(self, directory_path="data/"):
        """Load and split every ``*.pdf`` file directly inside a directory.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            A flat list with the chunks of all PDFs, in sorted file order.

        Raises:
            FileNotFoundError: If the directory does not exist or contains no
                PDF files.
        """
        if not os.path.exists(directory_path):
            raise FileNotFoundError(f"The directory {directory_path} does not exist.")
        # Escape the directory so characters such as "[" or "*" in its name are
        # matched literally, and join properly to avoid a doubled separator.
        pattern = os.path.join(glob.escape(directory_path), "*.pdf")
        # Sort for a deterministic chunk order across runs and platforms.
        pdf_files = sorted(glob.glob(pattern))
        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in the directory {directory_path}.")

        # Files are loaded concurrently; executor.map keeps the input order.
        with ThreadPoolExecutor() as executor:
            all_texts = list(executor.map(self._load_and_split_document, pdf_files))
        return [chunk for chunks in all_texts for chunk in chunks]

    def _load_and_split_document(self, file_path, chunk_size=2000, chunk_overlap=0):
        """Load one PDF and split it with ``RecursiveCharacterTextSplitter``.

        Args:
            file_path: Path of the PDF file.
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.

        Returns:
            The result of ``split_documents`` on the loaded PDF data.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        loader = PyPDFLoader(file_path)
        data = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.split_documents(data)

    def perform_similarity_search(self, docsearch, query):
        """Return ``docsearch.similarity_search(query)``.

        Args:
            docsearch: Object exposing ``similarity_search``.
            query: Non-empty query string.

        Raises:
            ValueError: If ``query`` is empty.
        """
        if not query:
            raise ValueError("Query should not be empty.")
        return docsearch.similarity_search(query)

# Script entry point: index the PDFs under the default directory, ask the
# user for a question, and log the answer produced by the QA chain.
if __name__ == "__main__":
    try:
        setup_logging()
        pdf_processor = PDFProcessor()
        texts = pdf_processor.load_pdfs_from_directory()
        num_docs = len(texts)
        logging.info(f"Loaded {num_docs} document(s).")
        docsearch = Chroma.from_documents(texts, pdf_processor.embeddings)
        chain = load_qa_chain(pdf_processor.llm, chain_type="stuff")
        query = pdf_processor.get_user_query()
        result = pdf_processor.perform_similarity_search(docsearch, query)
        if not result:
            logging.warning("No matching documents were found for the query.")
        else:
            # input_documents expects a list of documents; the original passed
            # each single document on its own. Hand the chain every retrieved
            # chunk in one call so it answers from all of them.
            logging.info(chain.run(input_documents=result, question=query))
    except Exception as e:
        # logging.exception keeps the traceback, which the plain error log dropped.
        logging.exception(f"An error occurred: {e}")

from build-ragai.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games; makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents code.