some updates

Ian Griffin 2025-11-06 22:46:12 +08:00
parent 51325bde69
commit cf4fdb6fb2
3 changed files with 66 additions and 43 deletions

View File

@@ -9,6 +9,7 @@
 import os
 import mimetypes
 import re
+import readline
 from argparse import ArgumentParser
 from langchain import hub
@@ -16,7 +17,8 @@ from langchain.chains import create_history_aware_retriever, create_retrieval_ch
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.retrievers.multi_query import MultiQueryRetriever
 from langchain_community.chat_message_histories import SQLChatMessageHistory
-from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
+from langchain_community.document_loaders import TextLoader, WebBaseLoader #, PyPDFLoader
+from langchain_pymupdf4llm import PyMuPDF4LLMLoader
 from langchain_core import vectorstores
 from langchain_core.documents import Document
 from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
@@ -25,7 +27,8 @@ from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_core.vectorstores import InMemoryVectorStore
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_ollama import OllamaEmbeddings, ChatOllama
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langgraph.checkpoint.memory import MemorySaver
 from langgraph.graph import START, StateGraph
@@ -38,20 +41,29 @@ from urllib.parse import urlparse
 from termcolor import colored
 
 
 def main():
+    #
+    # Readline settings
+    #
+    readline.parse_and_bind('set editing-mode vi')
     #
     # Parse Arguments
     #
     parser = ArgumentParser()
     parser.add_argument("-v", help="increase output verbosity", action="store_true")
-    parser.add_argument("-m", type=str, help="select OpenAI model to use", default="gpt-3.5-turbo")
+    parser.add_argument(
+        "-m",
+        type=str,
+        help="select language model to use",
+        default="gpt-oss"
+    )
     args, paths = parser.parse_known_args()
 
     #
     # load LLM
     #
-    llm = ChatOpenAI(model=args.m)
-    if args.v:
-        print(">>> Loaded LLM: %s" % llm, file=stderr)
+    # llm = ChatOpenAI(model=args.m)
+    llm = ChatOllama(model=args.m)
+    if args.v: print(">>> Loaded LLM: %s" % llm, file=stderr)
     #
     # load documents
@@ -59,7 +71,8 @@ def main():
     loaders = {
         "text": lambda file: TextLoader(file).load(),
-        "application/pdf": lambda file: PyPDFLoader(file).load(),
+        "application/pdf": lambda file: PyMuPDF4LLMLoader(file).load(),
+        # "application/pdf": lambda file: PyPDFLoader(file).load(),
         "url": lambda file: WebBaseLoader(file).load(),
     }
@@ -69,8 +82,7 @@ def main():
     for path in paths:
         # check if url:
         if urlparse(path).scheme in ("http", "https"):
-            if args.v:
-                print(">>> Loading %s as %s" % (path, "url"), file=stderr)
+            if args.v: print(">>> Loading %s as %s" % (path, "url"), file=stderr)
             docs.extend(loaders["url"](path))
 
         # check if file exists:
@@ -86,25 +98,29 @@ def main():
             if mimetype not in loaders:
                 raise ValueError("Unsupported file type: %s" % mimetype)
             else:
-                if args.v:
-                    print(">>> Loading %s as %s" % (path, mimetype), file=stderr)
+                if args.v: print(">>> Loading %s as %s" % (path, mimetype), file=stderr)
                 docs.extend(loaders[mimetype](path))
 
-    splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)
-    if args.v:
-        print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr)
+    splits = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200
+    ).split_documents(docs)
+    if args.v: print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr)
 
     # vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=APIKeys.openai))
-    vectorstore = InMemoryVectorStore(embedding=OpenAIEmbeddings())
+    vectorstore = InMemoryVectorStore(
+        embedding=OllamaEmbeddings(model='nomic-embed-text')
+    )
     vectorstore.add_documents(splits)
-    if args.v:
-        print(">>> Vectorized %d chunks" % len(splits), file=stderr)
+    if args.v: print(">>> Vectorized %d chunks" % len(splits), file=stderr)
 
     simple_retriever = vectorstore.as_retriever()
-    retriever = MultiQueryRetriever.from_llm(retriever=simple_retriever, llm=llm)
-    if args.v:
-        print(">>> Created retriever", file=stderr)
+    retriever = MultiQueryRetriever.from_llm(
+        retriever=simple_retriever,
+        llm=llm
+    )
+    if args.v: print(">>> Created retriever", file=stderr)
 
     #
     # History Prompt
@@ -130,8 +146,7 @@ def main():
         llm, retriever, contextualize_q_prompt
     )
-    if args.v:
-        print(">>> Created history-aware retriever", file=stderr)
+    if args.v: print(">>> Created history-aware retriever", file=stderr)
 
     #
     # Prompt
@@ -153,10 +168,11 @@ def main():
         ]
     )
     question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
-    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
-    if args.v:
-        print(">>> Created RAG chain", file=stderr)
+    rag_chain = create_retrieval_chain(
+        history_aware_retriever,
+        question_answer_chain
+    )
+    if args.v: print(">>> Created RAG chain", file=stderr)
 
     #
     # Memory
@@ -186,8 +202,8 @@ def main():
     memory = MemorySaver()
     app = workflow.compile(checkpointer=memory)
-    if args.v:
-        print(">>> Created app memory\n", file=stderr)
+    if args.v: print(">>> Created app memory\n", file=stderr)
 
     #
     # Chat
     #
@@ -195,12 +211,12 @@ def main():
     while True:
         try:
-            question = input(colored("Q: ", "yellow", attrs=["reverse"]))
+            question = input(colored("Q:", "yellow", attrs=["reverse"]) + " ")
         except EOFError:
             print()
             break
-        print(colored("A: ", "green", attrs=["reverse"]), parse_markdown(app.invoke({"input": question},
+        print(colored("A:", "green", attrs=["reverse"]), parse_markdown(app.invoke({"input": question},
             config=config)["answer"]), end="\n\n")
 
 # We define a dict representing the state of the application.
@@ -228,25 +244,25 @@ def parse_markdown(text):
         # Check for headers
         if line.startswith("# "):
             level = len(line) - len(line.lstrip("#"))
-            header_text = line.lstrip("#").strip()
+            header_text = line.strip() #.lstrip("#").strip()
             formatted_text += colored(header_text, "blue", attrs=["bold", "underline"]) + "\n"
             continue
         if line.startswith("## "):
             level = len(line) - len(line.lstrip("#"))
-            header_text = line.lstrip("#").strip()
+            header_text = line.strip() #.lstrip("#").strip()
             formatted_text += colored(header_text, "blue", attrs=["bold"]) + "\n"
             continue
         if line.startswith("### "):
             level = len(line) - len(line.lstrip("#"))
-            header_text = line.lstrip("#").strip()
+            header_text = line.strip() #.lstrip("#").strip()
             formatted_text += colored(header_text, "cyan", attrs=["bold"]) + "\n"
             continue
 
         # Check for blockquotes
         if line.startswith(">"):
-            quote_text = line.lstrip(">").strip()
+            quote_text = line.strip() #.lstrip(">").strip()
             formatted_text += colored(quote_text, "yellow") + "\n"
             continue
@@ -276,4 +292,4 @@ def parse_markdown(text):
     return formatted_text
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
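
Taken together, this file's changes move the pipeline from hosted OpenAI models to local Ollama ones and from PyPDFLoader to PyMuPDF4LLMLoader. The wiring distills to the following minimal sketch — illustrative only, assuming a local Ollama server is running with the gpt-oss and nomic-embed-text models already pulled, and using a placeholder PDF path:

# Sketch of the swapped components; not part of the commit itself.
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

llm = ChatOllama(model="gpt-oss")                        # replaces ChatOpenAI
embeddings = OllamaEmbeddings(model="nomic-embed-text")  # replaces OpenAIEmbeddings

docs = PyMuPDF4LLMLoader("example.pdf").load()           # replaces PyPDFLoader; path is hypothetical
splits = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
).split_documents(docs)

vectorstore = InMemoryVectorStore(embedding=embeddings)  # vectors kept in process memory
vectorstore.add_documents(splits)
print(vectorstore.as_retriever().invoke("What is this document about?"))

The in-memory store means embeddings are recomputed on every run; the commented-out Chroma line in the diff hints at the persistent alternative.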

View File

@@ -3,9 +3,11 @@ gradio
 huggingface_hub
 langchain
 langchain-community
-langchain-openai
+# langchain-openai
+langchain-ollama
+langchain-pymupdf4llm
 langgraph
 openai
-pypdf==5.0.1
+# pypdf==5.0.1
 termcolor
-tiktoken
\ No newline at end of file
+tiktoken
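
A quick way to confirm the new dependency set resolves and the local models respond — a sketch, assuming Ollama is serving on its default port and both models have been pulled beforehand (ollama pull gpt-oss, ollama pull nomic-embed-text):

# Sanity check for the swapped requirements; hypothetical, not part of the commit.
from langchain_ollama import ChatOllama, OllamaEmbeddings

print(ChatOllama(model="gpt-oss").invoke("Reply with the single word: ok").content)
print(len(OllamaEmbeddings(model="nomic-embed-text").embed_query("test")))  # embedding dimensionality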

View File

@@ -1,7 +1,12 @@
-async document loading
 save conversation
 editable input
 toggleable rich text
-initial question argument
-no looping argument
+async document loading
+recursive directory reading
+skip files @argument
+proper markdown rendering
+initial question @argument
+no looping @argument
 better code structure
+huggingface models availability
+UI
@@ -10,4 +15,4 @@ x setup arguments
 x multi-query retriever
 x add verbose argument
 x add model argument
-x multi pdf
\ No newline at end of file
+x multi pdf
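
One possible shape for the new "recursive directory reading" item — a hypothetical sketch, not part of this commit, reusing the script's existing mimetype check so discovered files could feed the same loaders dict:

# Hypothetical helper for the "recursive directory reading" TODO item.
import os
import mimetypes

def iter_loadable_files(root, supported):
    # Walk the tree and yield (path, mimetype) for files the script can load.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            mimetype, _ = mimetypes.guess_type(path)
            if mimetype in supported:
                yield path, mimetype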