initial commit

2024-10-28 10:46:26 +08:00
commit 55b5bc7081
6 changed files with 332 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+old.py
+__pycache__
--- a/README.md
+++ b/README.md
@@ -0,0 +1,23 @@
+Ragger
+======
+
+## Description
+`ragger.py` is a command-line tool for retrieval-augmented generation (RAG) over multiple text files, PDFs and websites using LangChain and OpenAI.
+
+## Prerequisites
+Since this codebase uses OpenAI models, an OpenAI API key is needed.
+After obtaining an API key, set the environment variable OPENAI_API_KEY to the API key.
+
+## Installation
+This program can be installed by cloning the repository and installing the package using pip:
+```bash
+git clone https://github.com/your_username/ragger.git
+cd ragger
+pip install .
+```
+
+## Usage
+To use the `ragger` command, open a terminal and run:
+```bash
+ragger file_1_path.pdf file_2_path.txt https://url_1_path.net
+```
--- a/ragger.py
+++ b/ragger.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+
+import os
+import mimetypes
+import re
+
+from argparse import ArgumentParser
+from langchain import hub
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain_community.chat_message_histories import SQLChatMessageHistory
+from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
+from langchain_core import vectorstores
+from langchain_core.documents import Document
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.vectorstores import InMemoryVectorStore
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.graph import START, StateGraph
+from langgraph.graph.message import add_messages
+from sys import stderr
+from termcolor import colored
+from typing import Sequence
+from typing_extensions import Annotated, TypedDict
+from urllib.parse import urlparse
+from termcolor import colored
+
+def parse_markdown(text):
+    lines = text.splitlines()
+    formatted_text = ""
+    in_code_block = False
+
+    for line in lines:
+        # Check for code blocks
+        if line.startswith("```"):
+            in_code_block = not in_code_block
+            continue  # Skip the line with ```
+        elif in_code_block:
+            formatted_text += colored(line + "\n", "green")
+            continue
+
+        # Check for headers
+        if line.startswith("#"):
+            level = len(line) - len(line.lstrip("#"))
+            header_text = line.lstrip("#").strip()
+            formatted_text += colored(header_text, "blue", attrs=["bold"]) + "\n"
+            continue
+
+        # Check for blockquotes
+        if line.startswith(">"):
+            quote_text = line.lstrip(">").strip()
+            formatted_text += colored(quote_text, "yellow") + "\n"
+            continue
+
+        # Check for tables (rows separated by "|")
+        if "|" in line:
+            table_row = "\t".join(line.split("|")).strip()
+            formatted_text += table_row + "\n"
+            continue
+
+        # Inline formatting for bold, italic, and code
+        # Bold (**text** or __text__)
+        line = re.sub(r"\*\*(.*?)\*\*|__(.*?)__", lambda m: colored(m.group(1) or m.group(2), attrs=["bold"]), line)
+        # Italic (*text* or _text_)
+        line = re.sub(r"\*(.*?)\*|_(.*?)_", lambda m: colored(m.group(1) or m.group(2), attrs=["underline"]), line)
+        # Inline code (`code`)
+        line = re.sub(r"`(.*?)`", lambda m: colored(m.group(1), "green"), line)
+
+        # List items (bullets and numbers)
+        # Bulleted list
+        line = re.sub(r"^(\s*[-*])\s", lambda m: colored(m.group(1), "cyan") + " ", line)
+        # Numbered list
+        line = re.sub(r"^(\s*\d+\.)\s", lambda m: colored(m.group(1), "cyan") + " ", line)
+
+        # Add processed line to formatted text
+        formatted_text += line + "\n"
+
+    return formatted_text
+
+class State(TypedDict):
+    input: str
+    chat_history: Annotated[Sequence[BaseMessage], add_messages]
+    context: str
+    answer: str
+
+if __name__ == "__main__":
+    #
+    # Parse Arguments
+    #
+    parser = ArgumentParser()
+    parser.add_argument("-v", help="increase output verbosity", action="store_true")
+    parser.add_argument("-m", type=str, help="select OpenAI model to use", default="gpt-3.5-turbo")
+    args, paths = parser.parse_known_args()
+
+    #
+    # load LLM
+    #
+    llm = ChatOpenAI(model=args.m)
+    if args.v:
+        print(">>> Loaded LLM: %s" % llm, file=stderr)
+
+    #
+    # load documents
+    #
+    
+    loaders = {
+        "text/plain": lambda file: TextLoader(file).load(),
+        "application/pdf": lambda file: PyPDFLoader(file).load(),
+        "url": lambda file: WebBaseLoader(file).load(),
+    }
+
+#    docs = PyPDFLoader(paths[0]).load()
+    docs = []
+
+    for path in paths:
+        # check if url:
+        if urlparse(path).scheme in ("http", "https"):
+            if args.v:
+                print(">>> Loading %s as %s" % (path, "url"), file=stderr)
+            docs.extend(loaders["url"](path))
+
+        # check if file exists:
+        elif not os.path.exists(path):
+            raise FileNotFoundError("%s not found" %  path)
+        
+        # detect filetype
+        else:
+            mimetype, _ = mimetypes.guess_type(path)
+            if mimetype not in loaders:
+                raise ValueError("Unsupported file type: %s" % mimetype)
+            else:
+                if args.v:
+                    print(">>> Loading %s as %s" % (path, mimetype), file=stderr)
+                docs.extend(loaders[mimetype](path))
+
+    splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)
+    if args.v:
+        print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr)
+
+    # vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=APIKeys.openai))
+
+    vectorstore = InMemoryVectorStore(embedding=OpenAIEmbeddings())
+    vectorstore.add_documents(splits)
+    if args.v:
+        print(">>> Vectorized %d chunks" % len(splits), file=stderr)
+            
+    simple_retriever = vectorstore.as_retriever()
+    retriever = MultiQueryRetriever.from_llm(retriever=simple_retriever, llm=llm)
+    if args.v:
+        print(">>> Created retriever", file=stderr)
+
+    #
+    # History Prompt
+    #
+    contextualize_q_system_prompt = (
+        "Given a chat history and the latest user question "
+        "which might reference context in the chat history, "
+        "formulate a standalone question which can be understood "
+        "without the chat history. Do NOT answer the question, "
+        "just reformulate it if needed and otherwise return it as is."
+    )
+
+    contextualize_q_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", contextualize_q_system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    
+    
+    history_aware_retriever = create_history_aware_retriever(
+        llm, retriever, contextualize_q_prompt
+    )
+
+    if args.v:
+        print(">>> Created history-aware retriever", file=stderr)
+
+    #
+    # Prompt
+    #
+    system_prompt = (
+        "You are an assistant for question-answering tasks. "
+        "Use the following pieces of retrieved context to answer "
+        "the question. If you don't know the answer, say that you "
+        "don't know. Answer as detailed and easy to understand as possible."
+        "\n\n"
+        "{context}"
+    )
+        
+    qa_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+    if args.v:
+        print(">>> Created RAG chain", file=stderr)
+
+    #
+    # Memory
+    #
+
+    # We then define a simple node that runs the `rag_chain`.
+    # The `return` values of the node update the graph state, so here we just
+    # update the chat history with the input message and response.
+    def call_model(state: State):
+        response = rag_chain.invoke(state)
+        return {
+            "chat_history": [
+                HumanMessage(state["input"]),
+                AIMessage(response["answer"]),
+            ],
+            "context": response["context"],
+            "answer": response["answer"],
+        }
+
+    # Our graph consists only of one node:
+    workflow = StateGraph(state_schema=State)
+    workflow.add_edge(START, "model")
+    workflow.add_node("model", call_model)
+
+    # Finally, we compile the graph with a checkpointer object.
+    # This persists the state, in this case in memory.
+    memory = MemorySaver()
+    app = workflow.compile(checkpointer=memory)
+
+    if args.v:
+        print(">>> Created app memory\n", file=stderr)
+    #
+    # Chat
+    #
+    config = {"configurable": {"thread_id": "abc123"}}
+
+    while True:
+        try:
+            question = input(colored("Q: ", "yellow", attrs=["reverse"]))
+        except EOFError:
+            print()
+            break
+
+        print(colored("A: ", "green", attrs=["reverse"]), parse_markdown(app.invoke({"input": question},
+    config=config)["answer"]), end="\n\n")
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+bs4
+gradio
+huggingface_hub
+langchain
+langchain-chroma
+langchain-community
+langchain-openai
+langgraph
+openai
+pypdf==5.0.1
+termcolor
+tiktoken
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,29 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='my_awesome_package',
+    version='0.1',
+    packages=find_packages(),
+    package_data={
+        'my_package': ['*.py', '!old.py']
+    },
+    entry_points={
+        'console_scripts': [
+            'ragger = my_package.ragger:main'
+        ],
+    },
+    install_requires=[
+        'bs4'
+        'gradio'
+        'huggingface_hub'
+        'langchain'
+        'langchain-chroma'
+        'langchain-community'
+        'langchain-openai'
+        'langgraph'
+        'openai'
+        'pypdf==5.0.1'
+        'termcolor'
+        'tiktoken'
+    ],
+)
--- a/todo.txt
+++ b/todo.txt
@@ -0,0 +1,13 @@
+x setup setup.py
+better code structure
+huggingface models availability
+initial question argument
+no looping argument
+toggle rich text
+UI
+x setup arguments
+x rich text
+x multi-query retriever
+x add verbose argument
+x add model argument
+x multi pdf