Merge 44578649c2 into af6d4c2711

2024-03-03 23:13:22 -05:00 · 2024-03-03 23:13:22 -05:00 · 205618891f
parent af6d4c2711 44578649c2
commit 205618891f
7 changed files with 7971 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -158,3 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.DS_Store
--- a/16
+++ b/16
@ -1,11 +1,21 @@
-FROM python:3.11.6-alpine
+FROM python:3.12.2-slim-bookworm
+
+# Install system dependencies required for Python packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libffi-dev \
+    && rm -rf /var/lib/apt/lists/*

-RUN apk add --no-cache build-base libffi-dev
 RUN pip install poetry

 WORKDIR /mnt
-COPY pyproject.toml poetry.lock .
+
+# Copy only the files needed for the poetry installation to avoid cache invalidation
+COPY pyproject.toml poetry.lock ./
+
 RUN poetry install --no-root --only main

+# Copy the application
 COPY . .
+
 ENTRYPOINT ["poetry", "run", "python", "discollama.py"]
--- a/compose.yaml
+++ b/compose.yaml
@ -19,3 +19,14 @@ services:
      - /data
    ports:
      - 6379
+  
+  chroma:
+    image: ghcr.io/chroma-core/chroma:latest
+    volumes:
+      - index_data:/chroma/.chroma/index
+    ports:
+      - 8000:8000
+
+volumes:
+  index_data:
+    driver: local
--- a/data/qa.json
+++ b/data/qa.json
--- a/discollama.py
+++ b/discollama.py
@ -6,6 +6,7 @@ import argparse
 from datetime import datetime, timedelta

 import ollama
+import chromadb
 import discord
 import redis

@ -46,11 +47,12 @@ class Response:


 class Discollama:
-  def __init__(self, ollama, discord, redis, model):
+  def __init__(self, ollama, discord, redis, model, collection):
    self.ollama = ollama
    self.discord = discord
    self.redis = redis
    self.model = model
+    self.collection = collection

    # register event handlers
    self.discord.event(self.on_ready)
@ -100,6 +102,29 @@ class Discollama:
            reference_message.content,
          ]
        )
+    
+    # retrieve relevant context from vector store
+    knowledge = self.collection.query(
+      query_texts=[content],
+      n_results=2
+    )
+    # directly unpack the first list of documents if it exists, or use an empty list
+    documents = knowledge.get('documents', [[]])[0]
+
+    content = '\n'.join(
+      [
+        'Using the provided document, answer the user question to the best of your ability. You must try to use information from the provided document. Combine information in the document into a coherent answer.',
+        'If there is nothing in the document relevant to the user question, say \'Hmm, I don\'t know about that, try referencing the docs.\', before providing any other information you know.',
+        'Anything between the following `document` html blocks is retrieved from a knowledge bank, not part of the conversation with the user.',
+        '<document>',
+        '\n'.join(documents) if documents else '',
+        '</document>',
+        'Anything between the following `user` html blocks is part of the conversation with the user.',
+        '<user>',
+        content,
+        '</user>',
+      ]
+    )

    if not context:
      context = await self.load(channel_id=channel.id)
@ -152,10 +177,40 @@ class Discollama:
  def run(self, token):
    try:
      self.discord.run(token)
-    except Exception:
+    except Exception as e:  # NOTE(review): redis is closed only on this error path
+      logging.exception("An error occurred while running the bot: %s", e)  # also logs the traceback
      self.redis.close()


+def embed_data(collection):
+  """Add each item of every JSON-array file in data/ to `collection` (ids '<stem>-<index>'); bad files are logged and skipped."""
+  logging.info('embedding data...')
+  documents = []
+  ids = []
+  for filename in os.listdir('data'):
+    if filename.endswith('.json'):
+      filepath = os.path.join('data', filename)
+      with open(filepath, 'r', encoding='utf-8') as file:
+        try:
+          data = json.load(file)
+          if isinstance(data, list):
+            stem = filename.rsplit('.', 1)[0]
+            for index, item in enumerate(data):
+              documents.append(item)
+              ids.append(f'{stem}-{index}')
+          else:
+            logging.warning('The file %s is not a JSON array.', filename)
+        except json.JSONDecodeError as e:
+          logging.exception('Error decoding JSON from file %s: %s', filename, e)
+        except Exception as e:
+          logging.exception('An error occurred while processing file %s: %s', filename, e)
+  if not documents:
+    # nothing was loaded: skip the add instead of handing chroma empty lists
+    logging.warning('no documents found in data/, skipping embedding')
+    return
+  collection.add(documents=documents, ids=ids)
+
+
 def main():
  parser = argparse.ArgumentParser()

@ -174,11 +229,16 @@ def main():
  intents = discord.Intents.default()
  intents.message_content = True

+  chroma = chromadb.Client()
+  collection = chroma.get_or_create_collection(name='discollama')
+  embed_data(collection)
+
  Discollama(
    ollama.AsyncClient(host=f'{args.ollama_scheme}://{args.ollama_host}:{args.ollama_port}'),
    discord.Client(intents=intents),
    redis.Redis(host=args.redis_host, port=args.redis_port, db=0, decode_responses=True),
    model=args.ollama_model,
+    collection=collection,
  ).run(os.environ['DISCORD_TOKEN'])


--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -10,6 +10,7 @@ python = "^3.11"
 discord-py = "^2.3.1"
 redis = "^5.0.1"
 ollama = "^0.1.0"
+chromadb = "^0.4.24"

 [build-system]
 requires = ["poetry-core"]