| | from pathlib import Path |
| | from typing import List |
| |
|
| | from langchain.schema import Document |
| | from langchain.text_splitter import RecursiveCharacterTextSplitter |
| | from langchain.vectorstores.chroma import Chroma |
| | from langchain_community.document_loaders import TextLoader |
| | from langchain_openai import OpenAIEmbeddings |
| |
|
| | import configs |
| |
|
# Module-level embedding model, created once at import time with default
# arguments and reused by the functions in this module.
embeddings_model = OpenAIEmbeddings()
| |
|
| |
|
def process_documents(doc_storage_path: str) -> Chroma:
    """Index every ``*.txt`` file in a directory into the persisted Chroma store.

    Each file is loaded with ``TextLoader``, split into chunks sized by
    ``configs.CHUNK_SIZE`` / ``configs.CHUNK_OVERLAP``, and added to the
    Chroma store whose persist directory is ``configs.STORE_FILE``.

    Args:
        doc_storage_path: Directory searched (non-recursively) for
            ``*.txt`` files.

    Returns:
        The Chroma vector store the chunks were added to.
    """
    print("doc preprocessing...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )
    for file_path in Path(doc_storage_path).glob("*.txt"):
        documents = TextLoader(str(file_path)).load()
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to index for this file (e.g. it is empty); skip it
            # instead of sending an empty batch to the store.
            continue
        # Add to the store opened above rather than going through the
        # ``from_documents`` constructor, which would build a brand-new
        # store object for every file.
        doc_search.add_documents(chunks)
    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search
| |
|
| |
|
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents in input order; empty when
        ``docs`` is empty.
    """
    separator = "\n\n"
    contents = (doc.page_content for doc in docs)
    return separator.join(contents)
| |
|