embed knowledge using chromadb embeddings

2024-03-03 23:04:16 -05:00 · 2024-03-03 23:04:16 -05:00 · cda0ea2f1a
parent 109b974ece
commit cda0ea2f1a
3 changed files with 2230 additions and 2 deletions
--- a/discollama.py
+++ b/discollama.py
@ -6,6 +6,7 @@ import argparse
 from datetime import datetime, timedelta

 import ollama
+import chromadb
 import discord
 import redis

@ -46,11 +47,12 @@ class Response:


 class Discollama:
-  def __init__(self, ollama, discord, redis, model):
+  def __init__(self, ollama, discord, redis, model, collection):
    self.ollama = ollama
    self.discord = discord
    self.redis = redis
    self.model = model
+    self.collection = collection

    # register event handlers
    self.discord.event(self.on_ready)
@ -101,6 +103,29 @@ class Discollama:
          ]
        )
    
+    # retrieve relevant context from vector store
+    knowledge = self.collection.query(
+      query_texts=[content],
+      n_results=2
+    )
+    # directly unpack the first list of documents if it exists, or use an empty list
+    documents = knowledge.get('documents', [[]])[0]
+
+    content = '\n'.join(
+      [
+        'Using the provided document, answer the user question to the best of your ability. You must try to use information from the provided document. Combine information in the document into a coherent answer.',
+        'If there is nothing in the document relevant to the user question, say \'Hmm, I don\'t know about that, try referencing the docs.\', before providing any other information you know.',
+        'Anything between the following `document` html blocks is retrieved from a knowledge bank, not part of the conversation with the user.',
+        '<document>',
+        '\n'.join(documents) if documents else '',
+        '</document>',
+        'Anything between the following `user` html blocks is part of the conversation with the user.',
+        '<user>',
+        content,
+        '</user>',
+      ]
+    )
+
    if not context:
      context = await self.load(channel_id=channel.id)

@ -157,6 +182,35 @@ class Discollama:
      self.redis.close()


+def embed_data(collection):
+  """Load the JSON arrays found in the ``data`` folder into ``collection``.
+
+  Every ``*.json`` file directly inside ``data`` is parsed. When its top-level
+  value is a list, each element is queued as a document whose id is
+  ``<file name without its last extension>-<index in the array>``. Files that
+  cannot be parsed, or whose top-level value is not a list, are logged and
+  skipped so one bad file does not discard the rest.
+
+  Args:
+    collection: object exposing ``add(documents=..., ids=...)``. It is called
+      at most once, and only when at least one document was collected.
+
+  Raises:
+    OSError: if the ``data`` folder cannot be listed or a file cannot be opened.
+  """
+  logging.info('embedding data...')
+  documents = []
+  ids = []
+  # read all data from the data folder
+  for filename in os.listdir('data'):
+    if not filename.endswith('.json'):
+      continue
+    filepath = os.path.join('data', filename)
+    # JSON text is UTF-8; do not depend on the platform's default encoding
+    with open(filepath, 'r', encoding='utf-8') as file:
+      try:
+        data = json.load(file)
+      except json.JSONDecodeError as e:
+        logging.exception(f"Error decoding JSON from file {filename}: {e}")
+        continue
+      except Exception as e:
+        logging.exception(f"An error occurred while processing file {filename}: {e}")
+        continue
+    if not isinstance(data, list):
+      # was missing the f-prefix, so the literal text "{filename}" was logged
+      logging.warning(f"The file {filename} is not a JSON array.")
+      continue
+    stem = filename.rsplit('.', 1)[0]
+    for index, item in enumerate(data):
+      # NOTE(review): items are passed through unchanged; presumably each one
+      # is a string, as the collection is later queried with text -- confirm
+      documents.append(item)
+      ids.append(f"{stem}-{index}")
+  if not documents:
+    # adding empty lists is rejected by the collection, so skip the call
+    logging.warning('no documents found in the data folder, nothing to embed')
+    return
+  # store the data in chroma for look-up
+  collection.add(
+    documents=documents,
+    ids=ids,
+  )
+
+
 def main():
  parser = argparse.ArgumentParser()

@ -175,11 +229,16 @@ def main():
  intents = discord.Intents.default()
  intents.message_content = True

+  chroma = chromadb.Client()
+  collection = chroma.get_or_create_collection(name='discollama')
+  embed_data(collection)
+
  Discollama(
    ollama.AsyncClient(host=f'{args.ollama_scheme}://{args.ollama_host}:{args.ollama_port}'),
    discord.Client(intents=intents),
    redis.Redis(host=args.redis_host, port=args.redis_port, db=0, decode_responses=True),
    model=args.ollama_model,
+    collection=collection,
  ).run(os.environ['DISCORD_TOKEN'])


--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -10,6 +10,7 @@ python = "^3.11"
 discord-py = "^2.3.1"
 redis = "^5.0.1"
 ollama = "^0.1.0"
+chromadb = "^0.4.24"

 [build-system]
 requires = ["poetry-core"]