From cf4fdb6fb296de530bc05cd2f2c04cb03bad0021 Mon Sep 17 00:00:00 2001 From: Ian Griffin Date: Thu, 6 Nov 2025 22:46:12 +0800 Subject: [PATCH] some updates --- ragger.py | 88 ++++++++++++++++++++++++++++-------------------- requirements.txt | 8 +++-- todo.txt | 13 ++++--- 3 files changed, 66 insertions(+), 43 deletions(-) diff --git a/ragger.py b/ragger.py index 823ce77..701be61 100755 --- a/ragger.py +++ b/ragger.py @@ -9,6 +9,7 @@ import os import mimetypes import re +import readline from argparse import ArgumentParser from langchain import hub @@ -16,7 +17,8 @@ from langchain.chains import create_history_aware_retriever, create_retrieval_ch from langchain.chains.combine_documents import create_stuff_documents_chain from langchain.retrievers.multi_query import MultiQueryRetriever from langchain_community.chat_message_histories import SQLChatMessageHistory -from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader +from langchain_community.document_loaders import TextLoader, WebBaseLoader #, PyPDFLoader +from langchain_pymupdf4llm import PyMuPDF4LLMLoader from langchain_core import vectorstores from langchain_core.documents import Document from langchain_core.messages import AIMessage, BaseMessage, HumanMessage @@ -25,7 +27,8 @@ from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate from langchain_core.runnables import RunnablePassthrough from langchain_core.runnables.history import RunnableWithMessageHistory from langchain_core.vectorstores import InMemoryVectorStore -from langchain_openai import OpenAIEmbeddings, ChatOpenAI +# from langchain_openai import OpenAIEmbeddings, ChatOpenAI +from langchain_ollama import OllamaEmbeddings, ChatOllama from langchain_text_splitters import RecursiveCharacterTextSplitter from langgraph.checkpoint.memory import MemorySaver from langgraph.graph import START, StateGraph @@ -38,20 +41,29 @@ from urllib.parse import urlparse from termcolor import colored def main(): + # + # Readline settings + # + readline.parse_and_bind('set editing-mode vi') # # Parse Arguments # parser = ArgumentParser() parser.add_argument("-v", help="increase output verbosity", action="store_true") - parser.add_argument("-m", type=str, help="select OpenAI model to use", default="gpt-3.5-turbo") + parser.add_argument( + "-m", + type=str, + help="select language model to use", + default="gpt-oss" + ) args, paths = parser.parse_known_args() # # load LLM # - llm = ChatOpenAI(model=args.m) - if args.v: - print(">>> Loaded LLM: %s" % llm, file=stderr) + # llm = ChatOpenAI(model=args.m) + llm = ChatOllama(model=args.m) + if args.v: print(">>> Loaded LLM: %s" % llm, file=stderr) # # load documents @@ -59,7 +71,8 @@ def main(): loaders = { "text": lambda file: TextLoader(file).load(), - "application/pdf": lambda file: PyPDFLoader(file).load(), + "application/pdf": lambda file: PyMuPDF4LLMLoader(file).load(), + # "application/pdf": lambda file: PyPDFLoader(file).load(), "url": lambda file: WebBaseLoader(file).load(), } @@ -69,8 +82,7 @@ def main(): for path in paths: # check if url: if urlparse(path).scheme in ("http", "https"): - if args.v: - print(">>> Loading %s as %s" % (path, "url"), file=stderr) + if args.v: print(">>> Loading %s as %s" % (path, "url"), file=stderr) docs.extend(loaders["url"](path)) # check if file exists: @@ -86,25 +98,29 @@ def main(): if mimetype not in loaders: raise ValueError("Unsupported file type: %s" % mimetype) else: - if args.v: - print(">>> Loading %s as %s" % (path, mimetype), file=stderr) + if args.v: print(">>> Loading %s as %s" % (path, mimetype), file=stderr) docs.extend(loaders[mimetype](path)) - splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs) - if args.v: - print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr) + splits = RecursiveCharacterTextSplitter( + chunk_size=1000, + chunk_overlap=200 + ).split_documents(docs) + if args.v: print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr) # vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=APIKeys.openai)) - vectorstore = InMemoryVectorStore(embedding=OpenAIEmbeddings()) + vectorstore = InMemoryVectorStore( + embedding=OllamaEmbeddings(model='nomic-embed-text') + ) vectorstore.add_documents(splits) - if args.v: - print(">>> Vectorized %d chunks" % len(splits), file=stderr) - + if args.v: print(">>> Vectorized %d chunks" % len(splits), file=stderr) + simple_retriever = vectorstore.as_retriever() - retriever = MultiQueryRetriever.from_llm(retriever=simple_retriever, llm=llm) - if args.v: - print(">>> Created retriever", file=stderr) + retriever = MultiQueryRetriever.from_llm( + retriever=simple_retriever, + llm=llm + ) + if args.v: print(">>> Created retriever", file=stderr) # # History Prompt @@ -130,8 +146,7 @@ def main(): llm, retriever, contextualize_q_prompt ) - if args.v: - print(">>> Created history-aware retriever", file=stderr) + if args.v: print(">>> Created history-aware retriever", file=stderr) # # Prompt @@ -153,10 +168,11 @@ def main(): ] ) question_answer_chain = create_stuff_documents_chain(llm, qa_prompt) - rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain) - - if args.v: - print(">>> Created RAG chain", file=stderr) + rag_chain = create_retrieval_chain( + history_aware_retriever, + question_answer_chain + ) + if args.v: print(">>> Created RAG chain", file=stderr) # # Memory @@ -186,8 +202,8 @@ def main(): memory = MemorySaver() app = workflow.compile(checkpointer=memory) - if args.v: - print(">>> Created app memory\n", file=stderr) + if args.v: print(">>> Created app memory\n", file=stderr) + # # Chat # @@ -195,12 +211,12 @@ def main(): while True: try: - question = input(colored("Q: ", "yellow", attrs=["reverse"])) + question = input(colored("Q:", "yellow", attrs=["reverse"]) + " ") except EOFError: print() break - print(colored("A: ", "green", attrs=["reverse"]), parse_markdown(app.invoke({"input": question}, + print(colored("A:", "green", attrs=["reverse"]), parse_markdown(app.invoke({"input": question}, config=config)["answer"]), end="\n\n") # We define a dict representing the state of the application. @@ -228,25 +244,25 @@ def parse_markdown(text): # Check for headers if line.startswith("# "): level = len(line) - len(line.lstrip("#")) - header_text = line.lstrip("#").strip() + header_text = line.strip() #.lstrip("#").strip() formatted_text += colored(header_text, "blue", attrs=["bold", "underline"]) + "\n" continue if line.startswith("## "): level = len(line) - len(line.lstrip("#")) - header_text = line.lstrip("#").strip() + header_text = line.strip() #.lstrip("#").strip() formatted_text += colored(header_text, "blue", attrs=["bold"]) + "\n" continue if line.startswith("### "): level = len(line) - len(line.lstrip("#")) - header_text = line.lstrip("#").strip() + header_text = line.strip() #.lstrip("#").strip() formatted_text += colored(header_text, "cyan", attrs=["bold"]) + "\n" continue # Check for blockquotes if line.startswith(">"): - quote_text = line.lstrip(">").strip() + quote_text = line.strip() #.lstrip(">").strip() formatted_text += colored(quote_text, "yellow") + "\n" continue @@ -276,4 +292,4 @@ def parse_markdown(text): return formatted_text if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/requirements.txt b/requirements.txt index 036d68c..94be586 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,11 @@ gradio huggingface_hub langchain langchain-community -langchain-openai +# langchain-openai +langchain-ollama +langchain-pymupdf4llm langgraph openai -pypdf==5.0.1 +# pypdf==5.0.1 termcolor -tiktoken \ No newline at end of file +tiktoken diff --git a/todo.txt b/todo.txt index 79f906e..860ce4c 100644 --- a/todo.txt +++ b/todo.txt @@ -1,7 +1,12 @@ -async document loading +save conversation +editable input toggleable rich text -initial question argument -no looping argument +async document loading +recursive directory reading +skip files @argument +proper markdown rendering +initial question @argument +no looping @argument better code structure huggingface models availability UI @@ -10,4 +15,4 @@ x setup arguments x multi-query retriever x add verbose argument x add model argument -x multi pdf \ No newline at end of file +x multi pdf