image vision (cooks pcs)

2025-08-02 21:39:23 +02:00 · 2025-08-02 21:39:23 +02:00 · b3c772cc20
commit b3c772cc20
parent 8816fe5b39
4 changed files with 417 additions and 5 deletions
--- a/README.md
+++ b/README.md
@ -7,6 +7,8 @@ your friendly ai assistant. frontend for ollama.
 - git clone repository
 - install ollama from `https://ollama.ai/download`
 - pull a model from ollama (i recommend gemma3n:e4b for laptops like mine (i7-10750h + rtx 3050ti laptop edition))
+- for image stuff, you'll need a model like llava:7b
+- for webcam features, install fswebcam (Linux) or imagesnap (macOS)
 - copy config.example.toml to config.toml and edit it to have the model you selected, optionally set your name in [user]
 - npm i
 - node index.js
@ -31,10 +33,22 @@ lydia is written to be easily configurable through a toml file which is easier t
 - temperature = the temperature you want lydia to use. basically how random the model is. default is 0.8
 - max_tokens = the max context tokens you want lydia to use. basically how far she can remember. default is 8192

+## Camera settings
+- width = webcam capture width. default is 1280
+- height = webcam capture height. default is 720
+- quality = webcam capture quality (0-100). default is 100
+- device = specific camera device (false for default, or "/dev/video0", etc.)
+
 ## Runtime configuration
 the prompt can be changed by running l!prompt <text> in the chatbox. this only applies for the current session, if you want a persistent change, you can edit the config file.

+## Image & Webcam Commands
+- `l!image <path>` or `l!img <path>` - send an image file (or just `l!image` to be prompted for the file path)
+- `l!webcam` or `l!cam` - take and send a webcam snapshot
+- press `ESC` to open menu for image options
+
 # Other stuff
-by hitting escape you can tab out of the chatbox, here you can do cool things like:
+by hitting escape you can open the menu, here you can do cool things like:
+- send pictures and take webcam snapshots
+- get help and see all commands
 - hit Q or CTRL+C to quit lydia (but why would you wanna do that anyway?)
- yea thats it
--- a/config.example.toml
+++ b/config.example.toml
@ -13,3 +13,9 @@ name = "user"
 [advanced]
 temperature = 0.8
 max_tokens = 8192
+
+[camera]
+width = 1280
+height = 720
+quality = 100
+device = false # false = default camera, or a device path such as "/dev/video0" on linux
--- a/lydia.js
+++ b/lydia.js
@ -3,6 +3,9 @@ import blessed from "blessed";
 import { execSync } from "child_process";
 import toml from "toml";
 import fs from "fs";
+import path from "path";
+import NodeWebcam from "node-webcam";
+import sharp from "sharp";

 if (!fs.existsSync("./config.toml")) {
  if (fs.existsSync("./config.example.toml")) {
@ -27,6 +30,12 @@ let username = config.user.name;

 let facefont = config.appearance.facefont;

+// camera settings (the [camera] table is optional, so every value has a fallback)
+// width/height keep ||: a missing or 0 size falls back to a usable default
+let cameraWidth = config.camera?.width || 1280;
+let cameraHeight = config.camera?.height || 720;
+// quality is documented as 0-100, so 0 is a legal value: only fall back when it is missing
+let cameraQuality = config.camera?.quality ?? 100;
+// false means "use the default camera"; otherwise a device path such as "/dev/video0"
+let cameraDevice = config.camera?.device || false;
+
 let systemprompt = config.assistant.system_prompt
  .replace("${name}", assistantname)
  .replace("${username}", username);
@ -81,15 +90,87 @@ const inputBox = blessed.textbox({
  mouse: true,
  placeholder: `go on, tell ${assistantname} something!`,
 });
+
+// Menu overlay: a centered list that starts hidden.
+// The order of `items` matters: handleMenuSelection() dispatches on the selected index.
+const menuBox = blessed.list({
+  label: ` ${assistantname} menu `,
+  items: [
+    "help & commands",
+    "send picture",
+    "take webcam snapshot",
+    `exit ${assistantname}`,
+  ],
+  hidden: true,
+  top: "center",
+  left: "center",
+  width: 60,
+  height: 16,
+  keys: true,
+  vi: true,
+  mouse: true,
+  border: { type: "line" },
+  style: {
+    bg: "black",
+    border: { fg: "cyan" },
+    item: { fg: "white" },
+    selected: { bg: "cyan", fg: "black" },
+  },
+});
+
+// Message box used by showPopup(): centered, hidden until needed.
+// `tags` is on because showPopup() wraps its text in {center} tags.
+const popup = blessed.box({
+  parent: screen,
+  hidden: true,
+  tags: true,
+  content: "",
+  top: "center",
+  left: "center",
+  width: 40,
+  height: 8,
+  border: { type: "line" },
+  style: {
+    bg: "black",
+    border: { fg: "cyan" },
+  },
+});
+
+// OK button docked at the bottom of the popup; it receives focus whenever the popup is shown.
+const popupButton = blessed.button({
+  parent: popup,
+  bottom: 1,
+  left: "center",
+  width: 8,
+  height: 1,
+  content: "OK",
+  // `align` centers the label text inside the 8-column button
+  align: "center",
+  style: {
+    bg: "cyan",
+    fg: "black",
+    focus: {
+      bg: "white",
+      fg: "black",
+    },
+  },
+  mouse: true,
+  keys: true,
+});
+
 screen.append(faceBox);
 screen.append(chatBox);
 screen.append(inputBox);
+screen.append(menuBox);
+screen.append(popup);

 inputBox.focus();

 let chatHistory = [];
 let conversationHistory = [];
 let currentStreamMessage = "";
+let menuVisible = false;

 function addMessage(role, content) {
  let message;
@ -204,6 +285,219 @@ function clearChatHistory() {
  screen.render();
 }

+// webcam configuration: size, quality and device come from the camera settings above,
+// the remaining values are fixed
+const webcamOptions = {
+  device: cameraDevice,
+  width: cameraWidth,
+  height: cameraHeight,
+  quality: cameraQuality,
+  output: "jpeg",
+  saveShots: true,
+  delay: 0,
+  callbackReturn: "location"
+};
+
+// single capture handle, used by takeWebcamSnapshot()
+const webcam = NodeWebcam.create(webcamOptions);
+
+/**
+ * Read an image from disk, shrink it for the vision model and return it as base64.
+ *
+ * The image is fitted inside 800x600 (never enlarged) and re-encoded as JPEG at
+ * quality 80 before being base64-encoded.
+ *
+ * @param {string} imagePath - path of the image file to read
+ * @returns {Promise<string>} base64 text of the re-encoded JPEG
+ * @throws {Error} "Failed to process image: ..." with the underlying error attached as `cause`
+ */
+async function convertImageToBase64(imagePath) {
+  try {
+    const buffer = await sharp(imagePath)
+      .resize(800, 600, { fit: 'inside', withoutEnlargement: true })
+      .jpeg({ quality: 80 })
+      .toBuffer();
+
+    return buffer.toString('base64');
+  } catch (error) {
+    // keep the original error reachable so its stack is not lost when we re-wrap it
+    throw new Error(`Failed to process image: ${error.message}`, { cause: error });
+  }
+}
+
+/**
+ * Ask for an image path in a temporary textbox, then send that image to the model.
+ *
+ * The returned promise always resolves (it never rejects): after a send attempt,
+ * after a validation error (reported through showPopup), or when the prompt is
+ * cancelled.
+ *
+ * @returns {Promise<void>}
+ */
+async function sendImageFile() {
+  return new Promise((resolve) => {
+    // Create a simple input box for file path
+    const fileInput = blessed.textbox({
+      parent: screen,
+      top: 'center',
+      left: 'center',
+      width: '80%',
+      height: 5,
+      border: {
+        type: 'line'
+      },
+      style: {
+        border: {
+          fg: 'cyan'
+        },
+        bg: 'black'
+      },
+      inputOnFocus: true,
+      keys: true,
+      mouse: true,
+      label: ' Enter Image File Path (ESC to cancel) ',
+      placeholder: 'Enter full path to image file...'
+    });
+
+    let closed = false;
+
+    // Tear the prompt down exactly once, whichever path closes it.
+    function closePrompt() {
+      if (closed) return;
+      closed = true;
+      // the escape handler belongs to this prompt only; without removing it every
+      // call would leave one more listener registered on the screen
+      screen.unkey(['escape'], cancelPrompt);
+      fileInput.destroy();
+      // redraw so the destroyed prompt disappears right away
+      screen.render();
+    }
+
+    // Shared by the textbox's own cancel event and the screen-level escape key.
+    function cancelPrompt() {
+      closePrompt();
+      resolve();
+    }
+
+    screen.append(fileInput);
+    fileInput.focus();
+    screen.render();
+
+    fileInput.on('submit', async (filePath) => {
+      closePrompt();
+
+      const trimmedPath = (filePath ?? '').trim();
+      if (!trimmedPath) {
+        resolve();
+        return;
+      }
+
+      // Check if file exists
+      if (!fs.existsSync(trimmedPath)) {
+        showPopup('Error: File does not exist');
+        resolve();
+        return;
+      }
+
+      const ext = path.extname(trimmedPath).toLowerCase();
+      const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'];
+
+      if (!imageExtensions.includes(ext)) {
+        showPopup('Error: Please select a valid image file\n(.jpg, .png, .gif, .bmp, .webp)');
+        resolve();
+        return;
+      }
+
+      try {
+        const base64Image = await convertImageToBase64(trimmedPath);
+        // log the user's line first so it appears above the streamed reply,
+        // and only once the image is known to be readable
+        addMessage(username, `[Sent image: ${path.basename(trimmedPath)}]`);
+        await sendMessageWithImage("Here's an image I'd like you to look at:", base64Image);
+      } catch (error) {
+        showPopup(`Error processing image: ${error.message}`);
+      } finally {
+        resolve();
+      }
+    });
+
+    fileInput.on('cancel', cancelPrompt);
+    screen.key(['escape'], cancelPrompt);
+  });
+}
+
+/**
+ * Capture a webcam frame, send it to the model, and delete the temporary file.
+ *
+ * Capture and processing failures are reported through showPopup; the capture
+ * callback always resolves the returned promise.
+ *
+ * @returns {Promise<void>}
+ */
+async function takeWebcamSnapshot() {
+  showPopup('Taking webcam snapshot...\nPlease wait...');
+
+  // ':' and '.' in the ISO timestamp are replaced so it is safe inside a file name
+  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+  const filename = `webcam-${timestamp}.jpg`;
+  const filepath = path.join(process.cwd(), filename);
+
+  return new Promise((resolve) => {
+    webcam.capture(filename, async (err) => {
+      // dismiss the "please wait" popup before reporting anything else
+      popup.hide();
+      screen.render();
+
+      try {
+        if (err) {
+          let errorMsg = err.message ? `Webcam error: ${err.message}` : 'Webcam error occurred';
+          if (err.message && err.message.includes('No such file or directory')) {
+            errorMsg += '\n\nTip: Make sure you have a webcam connected\nand try installing fswebcam or imagesnap';
+          }
+          showPopup(errorMsg);
+          return;
+        }
+
+        // check if file exists with a small delay
+        await new Promise((r) => setTimeout(r, 500));
+
+        if (!fs.existsSync(filepath)) {
+          showPopup('Error: Snapshot file was not created\nCheck if webcam is connected and accessible');
+          return;
+        }
+
+        const base64Image = await convertImageToBase64(filepath);
+
+        // showPopup() moved focus to the (now hidden) OK button and nothing else
+        // returns it on the success path, so hand the keyboard back to the chat input
+        inputBox.focus();
+
+        // log the user's line first so it appears above the streamed reply
+        addMessage(username, `[Took webcam snapshot: ${filename}]`);
+        await sendMessageWithImage("I just took this webcam snapshot:", base64Image);
+      } catch (error) {
+        showPopup(`Error processing snapshot: ${error.message}`);
+      } finally {
+        // single cleanup path for the temporary file; failures here are deliberately
+        // ignored (best effort), and `force` makes a missing file a no-op
+        try {
+          fs.rmSync(filepath, { force: true });
+        } catch {
+          // ignore cleanup errors
+        }
+        resolve();
+      }
+    });
+  });
+}
+
+/**
+ * Show `message` centered in the popup box and move focus to its OK button.
+ *
+ * @param {string} message - text to display; "\n" starts a new line
+ */
+function showPopup(message) {
+  // the popup parses tags, so literal braces in the message (file paths, error
+  // text) are escaped first to keep them from being read as markup
+  popup.setContent(`{center}${blessed.escape(String(message))}{/center}`);
+  popup.show();
+  popupButton.focus();
+  screen.render();
+}
+
+/**
+ * Open the menu: mark it visible, show the list, give it keyboard focus and redraw.
+ */
+function showMenu() {
+  menuVisible = true;
+  menuBox.show();
+  menuBox.focus();
+  screen.render();
+}
+
+/**
+ * Close the menu: clear the visible flag, hide the list, return focus to the
+ * chat input and redraw.
+ */
+function hideMenu() {
+  menuVisible = false;
+  menuBox.hide();
+  inputBox.focus();
+  screen.render();
+}
+
+/**
+ * Run the action for the currently highlighted menu entry.
+ *
+ * The menu is closed first; the index then maps to the entries of menuBox:
+ * 0 = help text, 1 = send picture, 2 = webcam snapshot, 3 = exit.
+ *
+ * @returns {Promise<void>}
+ */
+async function handleMenuSelection() {
+  // read the index before the menu is hidden
+  const choice = menuBox.selected;
+  hideMenu();
+
+  if (choice === 0) {
+    addMessage(
+      assistantname,
+      "available commands:\nl!help - if you wanna know what i can do, run this!\nl!clear - clear chat history, if you want me to forget everything, just run this!\nl!face <text> - if you want to force my expression, here you go! not sure i'll be too happy about it though.\nl!prompt <text> - if you want to change how i act, here you go! not sure i'll be too happy about that either.\nl!image <path> or l!img <path> - send an image file (or just l!image to browse)\nl!webcam or l!cam - take and send a webcam snapshot\n\nMenu shortcuts:\nESC - open/close menu\nUp/Down arrows - navigate menu\nEnter - select option\nCtrl+C or q - quit application\n\nMenu options:\n- help & commands - show this help\n- send picture - browse and send an image file\n- take webcam snapshot - capture and send a webcam photo\n- exit - quit the application\n\nNote: Image features require a vision-capable model like llava or bakllava in Ollama!\n",
+    );
+    return;
+  }
+
+  if (choice === 1) {
+    await sendImageFile();
+    return;
+  }
+
+  if (choice === 2) {
+    await takeWebcamSnapshot();
+    return;
+  }
+
+  if (choice === 3) {
+    process.exit(0);
+  }
+}
+
 async function sendMessage(message) {
  if (!message.trim()) return;

@ -246,6 +540,47 @@ async function sendMessage(message) {
  }
 }

+/**
+ * Send a user message with one attached image to the model and stream the reply.
+ *
+ * The user turn (text plus image) is appended to conversationHistory, the whole
+ * history is sent after the system prompt, and every streamed chunk is pushed to
+ * the chat view. Request failures are shown as a chat message instead of thrown.
+ *
+ * @param {string} message - text that accompanies the image; blank messages are ignored
+ * @param {string} base64Image - base64-encoded image data
+ * @returns {Promise<void>}
+ */
+async function sendMessageWithImage(message, base64Image) {
+  if (message.trim().length === 0) return;
+
+  const userTurn = { role: "user", content: message, images: [base64Image] };
+  conversationHistory.push(userTurn);
+
+  try {
+    currentStreamMessage = "";
+
+    const systemTurn = { role: "system", content: systemprompt };
+    const stream = await ollama.chat({
+      model: assistantmodel,
+      messages: [systemTurn, ...conversationHistory],
+      stream: true,
+      options: { num_predict: maxtokens, temperature: temperature },
+    });
+
+    for await (const chunk of stream) {
+      const text = chunk.message?.content;
+      // skip chunks that carry no text
+      if (!text) continue;
+      currentStreamMessage += text;
+      updateStreamMessage(currentStreamMessage);
+    }
+
+    finalizeStreamMessage();
+  } catch (error) {
+    addMessage(assistantname, `Failed to get response: ${error.message}`);
+  }
+}
+
 inputBox.on("submit", async (text) => {
  if (text.trim()) {
    inputBox.clearValue();
@ -261,7 +596,7 @@ inputBox.on("submit", async (text) => {
        case "help":
          addMessage(
            assistantname,
-            "available commands:\nl!help - if you wanna know what i can do, run this!\nl!clear - clear chat history, if you want me to forget everything, just run this!\nl!face <text> - if you want to force my expression, here you go! not sure i'll be too happy about it though.\nl!prompt <text> - if you want to change how i act, here you go! not sure i'll be too happy about that either.\n",
+            "available commands:\nl!help - if you wanna know what i can do, run this!\nl!clear - clear chat history, if you want me to forget everything, just run this!\nl!face <text> - if you want to force my expression, here you go! not sure i'll be too happy about it though.\nl!prompt <text> - if you want to change how i act, here you go! not sure i'll be too happy about that either.\nl!image <path> or l!img <path> - send an image file (or just l!image to browse)\nl!webcam or l!cam - take and send a webcam snapshot\n\nMenu shortcuts:\nESC - open/close menu\nUp/Down arrows - navigate menu\nEnter - select option\n\nMenu options:\n- help & commands - show this help\n- send picture - browse and send an image file\n- take webcam snapshot - capture and send a webcam photo\n- exit - quit the application\n",
          );
          break;
        case "clear":
@ -283,6 +618,35 @@ inputBox.on("submit", async (text) => {
            addMessage("system", "SYSTEM: You didn't provide a prompt.");
          }
          break;
+        case "image":
+        case "img":
+          if (args.trim()) {
+            const imagePath = args.trim();
+            if (!fs.existsSync(imagePath)) {
+              addMessage("system", "SYSTEM: Image file not found.");
+              break;
+            }
+            const ext = path.extname(imagePath).toLowerCase();
+            const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'];
+            if (!imageExtensions.includes(ext)) {
+              addMessage("system", "SYSTEM: Invalid image file format.");
+              break;
+            }
+            try {
+              const base64Image = await convertImageToBase64(imagePath);
+              await sendMessageWithImage("Here's an image I'd like you to look at:", base64Image);
+              addMessage(username, `[Sent image: ${path.basename(imagePath)}]`);
+            } catch (error) {
+              addMessage("system", `SYSTEM: Error processing image: ${error.message}`);
+            }
+          } else {
+            await sendImageFile();
+          }
+          break;
+        case "webcam":
+        case "cam":
+          await takeWebcamSnapshot();
+          break;
        default:
          addMessage(assistantname, `unknown command: ${command}`);
          break;
@ -299,7 +663,33 @@ screen.key(["q", "C-c"], () => {
 });

 screen.key(["escape"], () => {
-  // menu soon when i decide that i wanna do it
+  // escape toggles the menu: close it when it is open, open it otherwise
+  const toggleMenu = menuVisible ? hideMenu : showMenu;
+  toggleMenu();
+});
+
+// run the chosen entry when the list reports a selection; ignored unless the menu is open
+menuBox.on("select", async () => {
+  if (!menuVisible) return;
+  await handleMenuSelection();
+});
+
+// pressing OK dismisses the popup, returns focus to the chat input and redraws
+popupButton.on("press", () => {
+  popup.hide();
+  inputBox.focus();
+  screen.render();
+});
+
+// Enter dismisses a visible popup first; otherwise it confirms the open menu's selection
+screen.key(["enter"], async () => {
+  const popupOpen = !popup.hidden;
+  if (popupOpen) {
+    popup.hide();
+    inputBox.focus();
+    screen.render();
+    return;
+  }
+
+  if (menuVisible) {
+    await handleMenuSelection();
+  }
 });

 screen.on("resize", () => {
--- a/package.json
+++ b/package.json
@ -7,6 +7,8 @@
    "blessed": "^0.1.81",
    "figlet": "^1.8.2",
    "ollama": "^0.5.16",
-    "toml": "^3.0.0"
+    "toml": "^3.0.0",
+    "node-webcam": "^0.8.1",
+    "sharp": "^0.33.1"
  }
 }