added splitting options: -s flag to skip document splitting

2025-11-10 13:31:25 +08:00 · 2025-11-10 13:31:25 +08:00 · 91241a597b
parent cf4fdb6fb2
commit 91241a597b
1 changed files with 47 additions and 40 deletions
--- a/ragger.py
+++ b/ragger.py
@ -12,20 +12,20 @@ import re
 import readline

 from argparse import ArgumentParser
-from langchain import hub
+# from langchain import hub
 from langchain.chains import create_history_aware_retriever, create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.retrievers.multi_query import MultiQueryRetriever
-from langchain_community.chat_message_histories import SQLChatMessageHistory
+# from langchain_community.chat_message_histories import SQLChatMessageHistory
 from langchain_community.document_loaders import TextLoader, WebBaseLoader #, PyPDFLoader
 from langchain_pymupdf4llm import PyMuPDF4LLMLoader
-from langchain_core import vectorstores
-from langchain_core.documents import Document
+# from langchain_core import vectorstores
+# from langchain_core.documents import Document
 from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
-from langchain_core.output_parsers import StrOutputParser
+# from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
-from langchain_core.runnables import RunnablePassthrough
-from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.runnables import RunnableConfig #, RunnablePassthrough
+# from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_core.vectorstores import InMemoryVectorStore
 # from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_ollama import OllamaEmbeddings, ChatOllama 
@ -35,7 +35,7 @@ from langgraph.graph import START, StateGraph
 from langgraph.graph.message import add_messages
 from sys import stderr
 from termcolor import colored
-from typing import Sequence
+from typing import NotRequired, Sequence
 from typing_extensions import Annotated, TypedDict
 from urllib.parse import urlparse
 from termcolor import colored
@ -56,6 +56,11 @@ def main():
        help="select language model to use",
        default="gpt-oss"
    )
+    parser.add_argument(
+        "-s", 
+        help="don't split documents",
+        action="store_true"
+    )
    args, paths = parser.parse_known_args()

    #
@ -68,12 +73,23 @@ def main():
    #
    # load documents
    #
-    
+
+    splitter_func = lambda docs: docs
+    if not args.s:
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200
+        )
+        splitter_func = lambda docs: splitter.split_documents(docs)
+
+
+    if args.s: pdf_mode = 'single'
+    else: pdf_mode = 'page'
    loaders = {
-        "text": lambda file: TextLoader(file).load(),
-        "application/pdf": lambda file: PyMuPDF4LLMLoader(file).load(),
+        "text": lambda file: splitter_func(TextLoader(file).load()),
+        "application/pdf": lambda file: PyMuPDF4LLMLoader(file, mode=pdf_mode).load(),
        # "application/pdf": lambda file: PyPDFLoader(file).load(),
-        "url": lambda file: WebBaseLoader(file).load(),
+        "url": lambda file: splitter_func(WebBaseLoader(file).load()),
    }

 #    docs = PyPDFLoader(paths[0]).load()
@ -92,7 +108,7 @@ def main():
        # detect filetype
        else:
            mimetype, _ = mimetypes.guess_type(path)
-            if mimetype.startswith("text/"):
+            if (mimetype or "").startswith("text/"):
                mimetype = "text"

            if mimetype not in loaders:
@ -101,19 +117,14 @@ def main():
                if args.v: print(">>> Loading %s as %s" % (path, mimetype), file=stderr)
                docs.extend(loaders[mimetype](path))

-    splits = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200
-    ).split_documents(docs)
-    if args.v: print(">>> Split %d documents into %d chunks" % (len(docs), len(splits)), file=stderr)
-
-    # vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=APIKeys.openai))

+        # vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=APIKeys.openai))
+    
    vectorstore = InMemoryVectorStore(
        embedding=OllamaEmbeddings(model='nomic-embed-text')
    )
-    vectorstore.add_documents(splits)
-    if args.v: print(">>> Vectorized %d chunks" % len(splits), file=stderr)
+    vectorstore.add_documents(docs)
+    if args.v: print(">>> Vectorized %d chunks" % len(docs), file=stderr)

    simple_retriever = vectorstore.as_retriever()
    retriever = MultiQueryRetriever.from_llm(
@ -140,12 +151,11 @@ def main():
            ("human", "{input}"),
        ]
    )
-    
-    
+
+
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )
-
    if args.v: print(">>> Created history-aware retriever", file=stderr)

    #
@ -159,7 +169,7 @@ def main():
        "\n\n"
        "{context}"
    )
-        
+
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
@ -203,11 +213,11 @@ def main():
    app = workflow.compile(checkpointer=memory)

    if args.v: print(">>> Created app memory\n", file=stderr)
-    
+
    #
    # Chat
    #
-    config = {"configurable": {"thread_id": "abc123"}}
+    config: RunnableConfig  = {"configurable": {"thread_id": "abc123"}}

    while True:
        try:
@ -223,9 +233,9 @@ def main():
 # This state has the same input and output keys as `rag_chain`.
 class State(TypedDict):
    input: str
-    chat_history: Annotated[Sequence[BaseMessage], add_messages]
-    context: str
-    answer: str
+    chat_history: NotRequired[Annotated[Sequence[BaseMessage], add_messages]]
+    context: NotRequired[str]
+    answer: NotRequired[str]

 def parse_markdown(text):
    lines = text.splitlines()
@ -243,26 +253,23 @@ def parse_markdown(text):

        # Check for headers
        if line.startswith("# "):
-            level = len(line) - len(line.lstrip("#"))
-            header_text = line.strip() #.lstrip("#").strip()
+            header_text = line.lstrip("#").strip()
            formatted_text += colored(header_text, "blue", attrs=["bold", "underline"]) + "\n"
            continue
-        
+
        if line.startswith("## "):
-            level = len(line) - len(line.lstrip("#"))
-            header_text = line.strip() #.lstrip("#").strip()
+            header_text = line.lstrip("#").strip()
            formatted_text += colored(header_text, "blue", attrs=["bold"]) + "\n"
            continue
-        
+
        if line.startswith("### "):
-            level = len(line) - len(line.lstrip("#"))
-            header_text = line.strip() #.lstrip("#").strip()
+            header_text = line.lstrip("#").strip()
            formatted_text += colored(header_text, "cyan", attrs=["bold"]) + "\n"
            continue

        # Check for blockquotes
        if line.startswith(">"):
-            quote_text = line.strip() #.lstrip(">").strip()
+            quote_text = line.lstrip(">").strip()
            formatted_text += colored(quote_text, "yellow") + "\n"
            continue