From 55b5bc70817d125df2c9ee0698b806705c3574ae Mon Sep 17 00:00:00 2001
From: Ian Griffin
Date: Mon, 28 Oct 2024 10:46:26 +0800
Subject: [PATCH] initial commit

---
 .gitignore       |   2 +
 README.md        |  29 +++++
 ragger.py        | 257 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  12 +++
 setup.py         |  26 ++++++
 todo.txt         |  13 +++
 6 files changed, 339 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 ragger.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 todo.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a2f44ef
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+old.py
+__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..28e1224
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+Ragger
+======
+
+## Description
+`ragger.py` is a command-line tool for retrieval-augmented generation (RAG) over multiple text files, PDFs, and websites, using LangChain and OpenAI.
+
+## Prerequisites
+Since this codebase uses OpenAI models, an OpenAI API key is needed.
+After obtaining an API key, set the environment variable `OPENAI_API_KEY` to the key.
+
+## Installation
+This program can be installed by cloning the repository and installing the package with pip:
+```bash
+git clone https://github.com/your_username/ragger.git
+cd ragger
+pip install .
+```
+
+## Usage
+To use the `ragger` command, open a terminal and run:
+```bash
+ragger file_1_path.pdf file_2_path.txt https://url_1_path.net
+```
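+
+The `-v` flag turns on verbose progress logging and `-m` selects the OpenAI
+model (default `gpt-3.5-turbo`; the model name below is only an example):
+```bash
+ragger -v -m gpt-4o file_1_path.pdf
+```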
diff --git a/ragger.py b/ragger.py
new file mode 100755
index 0000000..3301216
--- /dev/null
+++ b/ragger.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+
+import os
+import mimetypes
+import re
+
+from argparse import ArgumentParser
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
+from langchain_core.vectorstores import InMemoryVectorStore
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.graph import START, StateGraph
+from langgraph.graph.message import add_messages
+from sys import stderr
+from termcolor import colored
+from typing import Sequence
+from typing_extensions import Annotated, TypedDict
+from urllib.parse import urlparse
+
+
+def parse_markdown(text):
+    """Render a small subset of Markdown as colored terminal text."""
+    lines = text.splitlines()
+    formatted_text = ""
+    in_code_block = False
+
+    for line in lines:
+        # Code blocks: color the body green and drop the ``` fence lines
+        if line.startswith("```"):
+            in_code_block = not in_code_block
+            continue
+        elif in_code_block:
+            formatted_text += colored(line + "\n", "green")
+            continue
+
+        # Headers
+        if line.startswith("#"):
+            header_text = line.lstrip("#").strip()
+            formatted_text += colored(header_text, "blue", attrs=["bold"]) + "\n"
+            continue
+
+        # Blockquotes
+        if line.startswith(">"):
+            quote_text = line.lstrip(">").strip()
+            formatted_text += colored(quote_text, "yellow") + "\n"
+            continue
+
+        # Tables (rows separated by "|")
+        if "|" in line:
+            table_row = "\t".join(line.split("|")).strip()
+            formatted_text += table_row + "\n"
+            continue
+
+        # Inline formatting for bold, italic, and code
+        # Bold (**text** or __text__)
+        line = re.sub(r"\*\*(.*?)\*\*|__(.*?)__", lambda m: colored(m.group(1) or m.group(2), attrs=["bold"]), line)
+        # Italic (*text* or _text_)
+        line = re.sub(r"\*(.*?)\*|_(.*?)_", lambda m: colored(m.group(1) or m.group(2), attrs=["underline"]), line)
+        # Inline code (`code`)
+        line = re.sub(r"`(.*?)`", lambda m: colored(m.group(1), "green"), line)
+
+        # List items (bullets and numbers)
+        # Bulleted list
+        line = re.sub(r"^(\s*[-*])\s", lambda m: colored(m.group(1), "cyan") + " ", line)
+        # Numbered list
+        line = re.sub(r"^(\s*\d+\.)\s", lambda m: colored(m.group(1), "cyan") + " ", line)
+
+        # Add the processed line to the formatted text
+        formatted_text += line + "\n"
+
+    return formatted_text
+
+
+class State(TypedDict):
+    input: str
+    chat_history: Annotated[Sequence[BaseMessage], add_messages]
+    context: str
+    answer: str
+
+
+def main():
+    #
+    # Parse arguments
+    #
+    parser = ArgumentParser()
+    parser.add_argument("-v", help="increase output verbosity", action="store_true")
+    parser.add_argument("-m", type=str, help="select OpenAI model to use", default="gpt-3.5-turbo")
+    parser.add_argument("paths", nargs="+", help="files and/or URLs to index")
+    args = parser.parse_args()
+
+    #
+    # Load LLM
+    #
+    llm = ChatOpenAI(model=args.m)
+    if args.v:
+        print(">>> Loaded LLM: %s" % llm, file=stderr)
+
+    #
+    # Load documents
+    #
+    loaders = {
+        "text/plain": lambda file: TextLoader(file).load(),
+        "application/pdf": lambda file: PyPDFLoader(file).load(),
+        "url": lambda file: WebBaseLoader(file).load(),
+    }
+
+    docs = []
+
+    for path in args.paths:
+        # URLs go straight to the web loader
+        if urlparse(path).scheme in ("http", "https"):
+            if args.v:
+                print(">>> Loading %s as %s" % (path, "url"), file=stderr)
+            docs.extend(loaders["url"](path))
+
+        elif not os.path.exists(path):
+            raise FileNotFoundError("%s not found" % path)
+
+        # Local files are dispatched on their guessed MIME type
+        else:
+            mimetype, _ = mimetypes.guess_type(path)
+            if mimetype not in loaders:
+                raise ValueError("Unsupported file type: %s" % mimetype)
+            if args.v:
+                print(">>> Loading %s as %s" % (path, mimetype), file=stderr)
+            docs.extend(loaders[mimetype](path))
+
+    splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)
+    if args.v:
+        print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr)
+
+    vectorstore = InMemoryVectorStore(embedding=OpenAIEmbeddings())
+    vectorstore.add_documents(splits)
+    if args.v:
+        print(">>> Vectorized %d chunks" % len(splits), file=stderr)
+
+    simple_retriever = vectorstore.as_retriever()
+    retriever = MultiQueryRetriever.from_llm(retriever=simple_retriever, llm=llm)
+    if args.v:
+        print(">>> Created retriever", file=stderr)
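+
+    # NOTE: MultiQueryRetriever asks the LLM to rewrite each question into
+    # several alternative phrasings, runs a similarity search for each one,
+    # and returns the de-duplicated union of the hits. This costs an extra
+    # LLM call per question but is more forgiving of poorly-worded queries
+    # than the plain vector-store retriever alone.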
+
+    #
+    # History Prompt
+    #
+    contextualize_q_system_prompt = (
+        "Given a chat history and the latest user question "
+        "which might reference context in the chat history, "
+        "formulate a standalone question which can be understood "
+        "without the chat history. Do NOT answer the question, "
+        "just reformulate it if needed and otherwise return it as is."
+    )
+
+    contextualize_q_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", contextualize_q_system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+
+    history_aware_retriever = create_history_aware_retriever(
+        llm, retriever, contextualize_q_prompt
+    )
+
+    if args.v:
+        print(">>> Created history-aware retriever", file=stderr)
+
+    #
+    # QA Prompt
+    #
+    system_prompt = (
+        "You are an assistant for question-answering tasks. "
+        "Use the following pieces of retrieved context to answer "
+        "the question. If you don't know the answer, say that you "
+        "don't know. Keep the answer as detailed and as easy to understand as possible."
+        "\n\n"
+        "{context}"
+    )
+
+    qa_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", system_prompt),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+    if args.v:
+        print(">>> Created RAG chain", file=stderr)
+
+    #
+    # Memory
+    #
+
+    # A single node that runs the `rag_chain`. The values returned by the node
+    # update the graph state, so here we just append the input message and the
+    # response to the chat history.
+    def call_model(state: State):
+        response = rag_chain.invoke(state)
+        return {
+            "chat_history": [
+                HumanMessage(state["input"]),
+                AIMessage(response["answer"]),
+            ],
+            "context": response["context"],
+            "answer": response["answer"],
+        }
+
+    # The graph consists of only this one node:
+    workflow = StateGraph(state_schema=State)
+    workflow.add_node("model", call_model)
+    workflow.add_edge(START, "model")
+
+    # Finally, compile the graph with a checkpointer object.
+    # This persists the state, in this case in memory.
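+    # NOTE (assumption): MemorySaver keeps checkpoints in process memory
+    # only, so chat history is lost when the program exits. A persistent
+    # checkpointer (e.g. langgraph's SqliteSaver) should be a drop-in
+    # replacement if history must survive across runs.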
+    memory = MemorySaver()
+    app = workflow.compile(checkpointer=memory)
+
+    if args.v:
+        print(">>> Created app memory\n", file=stderr)
+
+    #
+    # Chat
+    #
+    config = {"configurable": {"thread_id": "abc123"}}
+
+    while True:
+        try:
+            question = input(colored("Q: ", "yellow", attrs=["reverse"]))
+        except EOFError:
+            print()
+            break
+
+        answer = app.invoke({"input": question}, config=config)["answer"]
+        print(colored("A: ", "green", attrs=["reverse"]), parse_markdown(answer), end="\n\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d1b593d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+bs4
+gradio
+huggingface_hub
+langchain
+langchain-chroma
+langchain-community
+langchain-openai
+langgraph
+openai
+pypdf==5.0.1
+termcolor
+tiktoken
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..cc14bcb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+
+setup(
+    name='ragger',
+    version='0.1',
+    py_modules=['ragger'],
+    entry_points={
+        'console_scripts': [
+            'ragger = ragger:main',
+        ],
+    },
+    install_requires=[
+        'bs4',
+        'gradio',
+        'huggingface_hub',
+        'langchain',
+        'langchain-chroma',
+        'langchain-community',
+        'langchain-openai',
+        'langgraph',
+        'openai',
+        'pypdf==5.0.1',
+        'termcolor',
+        'tiktoken',
+    ],
+)
diff --git a/todo.txt b/todo.txt
new file mode 100644
index 0000000..ee11431
--- /dev/null
+++ b/todo.txt
@@ -0,0 +1,13 @@
+x setup setup.py
+better code structure
+huggingface models availability
+initial question argument
+no looping argument
+toggle rich text
+UI
+x setup arguments
+x rich text
+x multi-query retriever
+x add verbose argument
+x add model argument
+x multi pdf