AnythingLLM 🎤 Voice Recording (Local Whisper Integration)

🎤 AnythingLLM – Local Whisper Recording Integration

Add a Record button to AnythingLLM that lets you dictate messages using your microphone. The audio is sent to your own local Whisper server for transcription — no OpenAI API required.

🧩 Requirements

  • A running Whisper ASR endpoint (example: https://whisper.parsons.familyds.net/asr) that allows CORS requests from your AnythingLLM origin — the script uploads audio via fetch from the AnythingLLM page, so the Whisper server must send appropriate Access-Control-Allow-Origin headers
  • A supported desktop browser: Chrome, Firefox, Edge, or Brave
  • The Tampermonkey browser extension installed

⚙️ Installation Steps

  • Install the Tampermonkey extension for your browser.
  • Click the Tampermonkey icon → Create a new script…
  • Delete any existing placeholder code.
  • Paste the full script below.
  • Save the script (File → Save or Ctrl+S).
  • Visit your AnythingLLM site (or refresh it). You should see a 🎤 Record button next to the chat input.
  • Click once to record, click again to stop — your transcribed text appears automatically. Hold Shift while stopping to auto-send.

🧾 Full Script — AnythingLLM 🎤 Record (v1.3)

// ==UserScript==
// @name         AnythingLLM – 🎤 Record (local Whisper)
// @namespace    https://parsons.family
// @version      1.3
// @description  Adds a mic record button to AnythingLLM; transcribes via local Whisper; no OpenAI.
// @match        https://anythingllm.parsons.familyds.net/*
// @match        http://localhost:3001/*
// @match        http://192.168.1.200:3001/*
// @match        http://ai-server:3001/*
// @grant        none
// @run-at       document-idle
// ==/UserScript==

(function () {
  'use strict';

  const ASR_URL = 'https://whisper.parsons.familyds.net/asr';

  // Grab the native `value` setters off the element prototypes so writes
  // bypass React's wrapped accessors (React replaces the instance-level
  // setter; the prototype-level one still updates the real DOM value).
  const nativeValueSetter = (proto) =>
    Object.getOwnPropertyDescriptor(proto, 'value')?.set;
  const textareaSetter = nativeValueSetter(HTMLTextAreaElement.prototype);
  const inputSetter = nativeValueSetter(HTMLInputElement.prototype);

  /**
   * Write `val` into an input-like element in a way React-controlled
   * inputs pick up, then dispatch beforeinput/input/change so framework
   * listeners observe the change.
   * @param {Element} el  textarea, input, or contenteditable element
   * @param {string} val  the full replacement value
   */
  function setReactValue(el, val) {
    if (el instanceof HTMLTextAreaElement && textareaSetter) {
      textareaSetter.call(el, val);
    } else if (el instanceof HTMLInputElement && inputSetter) {
      inputSetter.call(el, val);
    } else if (el.isContentEditable) {
      // FIX: this branch was previously empty, so contenteditable targets
      // got the events below without the text ever being written.
      el.textContent = val;
    } else {
      el.value = val;
    }

    // Some browsers reject synthetic InputEvent construction; best-effort.
    try {
      el.dispatchEvent(new InputEvent('beforeinput', {
        inputType: 'insertText', data: val, bubbles: true, cancelable: true, composed: true
      }));
    } catch {}
    el.dispatchEvent(new Event('input', { bubbles: true, cancelable: true, composed: true }));
    el.dispatchEvent(new Event('change', { bubbles: true }));
  }

  // Read the current text of any supported input-like element
  // (empty string for anything unrecognized).
  function getValue(el) {
    const isFormField =
      el instanceof HTMLTextAreaElement || el instanceof HTMLInputElement;
    if (isFormField) return el.value || '';
    return el.isContentEditable ? (el.innerText || '') : '';
  }

  // Insert `text` at the caret of a contenteditable element, then fire
  // the beforeinput/input/change events frameworks listen for.
  function insertIntoContentEditable(el, text) {
    el.focus();
    const doc = el.ownerDocument;
    const selection = doc.getSelection();
    if (selection && selection.rangeCount > 0) {
      const range = selection.getRangeAt(0);
      range.deleteContents();
      range.insertNode(doc.createTextNode(text));
      range.collapse(false); // caret lands just after the inserted text
      selection.removeAllRanges();
      selection.addRange(range);
    } else {
      // No usable selection — append at the end instead.
      el.appendChild(doc.createTextNode(text));
    }
    el.dispatchEvent(new InputEvent('beforeinput', {
      inputType: 'insertText', data: text, bubbles: true, cancelable: true, composed: true
    }));
    el.dispatchEvent(new Event('input', { bubbles: true }));
    el.dispatchEvent(new Event('change', { bubbles: true }));
  }

  // Heuristic: a candidate counts as the chat input only if it is
  // rendered and reasonably large (>200×30 px).
  function isVisible(el) {
    const style = getComputedStyle(el);
    if (style.display === 'none' || style.visibility === 'hidden') return false;
    const rect = el.getBoundingClientRect();
    return rect.width > 200 && rect.height > 30;
  }

  // Sort comparator: larger on-screen area first.
  function byArea(a, b) {
    const area = (el) => {
      const r = el.getBoundingClientRect();
      return r.width * r.height;
    };
    return area(b) - area(a);
  }

  // Collect every plausible chat-input element in `rootDoc`,
  // visible ones only, largest first.
  function queryAllCandidates(rootDoc) {
    const selectors = [
      'textarea[placeholder*="message" i]',
      'textarea[aria-label*="message" i]',
      'textarea[aria-multiline="true"]',
      'textarea',
      '[role="textbox"][contenteditable="true"]',
      '.ProseMirror[contenteditable="true"]',
      '.ql-editor[contenteditable="true"]'
    ];
    const candidates = selectors.flatMap(
      (sel) => Array.from(rootDoc.querySelectorAll(sel))
    );
    return candidates.filter(isVisible).sort(byArea);
  }

  // Locate the chat input: top document first, then each same-origin
  // iframe. Returns { el, doc } or null if nothing matched.
  function findChatInput() {
    const inTop = queryAllCandidates(document);
    if (inTop.length > 0) return { el: inTop[0], doc: document };

    for (const frame of document.querySelectorAll('iframe')) {
      try {
        const frameDoc = frame.contentDocument;
        if (!frameDoc) continue;
        const inFrame = queryAllCandidates(frameDoc);
        if (inFrame.length > 0) return { el: inFrame[0], doc: frameDoc };
      } catch {
        // Cross-origin iframe — cannot inspect; skip it.
      }
    }
    return null;
  }

  // Inject the 🎤 Record button right after the chat input.
  // Idempotent: bails out if the button already exists or no input is found.
  function ensureButton() {
    if (document.getElementById('whisper-record-btn')) return;
    const found = findChatInput();
    if (!found) return;

    const input = found.el;
    const btn = document.createElement('button');
    btn.id = 'whisper-record-btn';
    btn.type = 'button'; // prevent accidental form submission
    btn.textContent = '🎤 Record';
    Object.assign(btn.style, {
      marginLeft: '8px',
      padding: '6px 10px',
      borderRadius: '10px',
      border: '1px solid #ccc',
      cursor: 'pointer',
      fontSize: '0.95rem'
    });

    input.insertAdjacentElement('afterend', btn);
    wireRecorder(btn, input);
    console.log('🎤 Record button injected next to', input);
  }

  /**
   * Wire click-to-toggle recording onto `btn`.
   * First click: request the mic and start a MediaRecorder.
   * Second click: stop; onstop hands the audio to doTranscribe, and the
   * Shift state of the stopping click decides auto-send.
   * @param {HTMLButtonElement} btn      the injected record button
   * @param {Element} inputEl            fallback insertion target
   */
  function wireRecorder(btn, inputEl) {
    let rec = null;
    let chunks = [];
    let stoppingEvent = null;

    btn.addEventListener('click', async (ev) => {
      if (!rec) {
        let stream = null;
        try {
          stream = await navigator.mediaDevices.getUserMedia({ audio: true });
          chunks = [];
          // FIX: prefer webm but fall back to the browser default so the
          // constructor does not throw where webm is unsupported (e.g. Safari).
          const opts = (window.MediaRecorder && MediaRecorder.isTypeSupported &&
                        MediaRecorder.isTypeSupported('audio/webm'))
            ? { mimeType: 'audio/webm' }
            : undefined;
          rec = new MediaRecorder(stream, opts);
          rec.ondataavailable = e => e.data && e.data.size && chunks.push(e.data);
          rec.onstop = () => doTranscribe(btn, inputEl, chunks, stoppingEvent).finally(() => {
            stream.getTracks().forEach(t => t.stop());
            rec = null; chunks = []; stoppingEvent = null;
            btn.disabled = false; btn.textContent = '🎤 Record';
          });
          rec.start();
          btn.textContent = '⏹ Stop (Shift=auto-send)';
        } catch (e) {
          // FIX: release the mic if setup failed after getUserMedia
          // succeeded (previously the stream leaked and the mic stayed on).
          if (stream) stream.getTracks().forEach(t => t.stop());
          rec = null;
          console.error(e);
          alert('Mic permission denied or unavailable.');
        }
      } else {
        stoppingEvent = ev;          // remember Shift state for auto-send
        btn.disabled = true;         // debounce until transcription finishes
        btn.textContent = '…processing';
        rec.stop();                  // fires onstop above
      }
    });
  }

  /**
   * Upload the recorded audio to the Whisper endpoint, extract the
   * transcript, and insert it into the chat input. Falls back to the
   * clipboard (or an alert) when no input can be found. Holding Shift on
   * the stopping click (`ev.shiftKey`) triggers auto-send.
   * @param {HTMLButtonElement} btn        (unused here; kept for callers)
   * @param {Element} inputFallback        insertion target if lookup fails
   * @param {Blob[]} chunks                recorded audio chunks
   * @param {MouseEvent|null} ev           the click event that stopped recording
   */
  async function doTranscribe(btn, inputFallback, chunks, ev) {
    try {
      const blob = new Blob(chunks, { type: 'audio/webm' });
      const form = new FormData();
      let url = ASR_URL;

      if (/\/asr(\?|$)/.test(ASR_URL)) {
        // whisper-asr-webservice style endpoint: options go in the query string.
        const joiner = ASR_URL.includes('?') ? '&' : '?';
        url += joiner + 'task=transcribe&language=en&output=json';
        form.append('audio_file', blob, 'recording.webm');
      } else {
        // Generic endpoint: ensure a /transcribe path and use the "file" field.
        if (!/\/transcribe(\?|$)/.test(ASR_URL)) {
          url = ASR_URL.replace(/\/$/, '') + '/transcribe';
        }
        form.append('file', blob, 'recording.webm');
      }

      const res = await fetch(url, { method: 'POST', body: form });
      if (!res.ok) throw new Error(`ASR ${res.status} ${res.statusText}`);

      // Servers differ: some return JSON, some plain text.
      const raw = await res.text();
      let parsed;
      try { parsed = JSON.parse(raw); } catch { parsed = { text: raw }; }

      // Accept the common response shapes: bare string, {text},
      // {transcription}, or [{text}, ...].
      let text = '';
      if (typeof parsed === 'string') {
        text = parsed;
      } else if (parsed && typeof parsed.text === 'string') {
        text = parsed.text;
      } else if (parsed && typeof parsed.transcription === 'string') {
        text = parsed.transcription;
      } else if (Array.isArray(parsed) && parsed[0]?.text) {
        text = parsed[0].text;
      }
      text = (text || '').trim();

      const found = findChatInput();
      const target = (found && found.el) || inputFallback || document.activeElement;

      if (target && text) {
        // Add a separating space unless the input is empty or ends with one.
        const current = getValue(target);
        const prefix = (current.endsWith(' ') || current === '') ? '' : ' ';
        insertAtCursor(target, prefix + text);
        if (ev && ev.shiftKey) trySend(target);
      } else if (navigator.clipboard && text) {
        await navigator.clipboard.writeText(text);
        alert('Transcript copied to clipboard (paste into AnythingLLM input).');
      } else {
        alert('Transcribed text:\n\n' + text);
      }
    } catch (e) {
      console.error('ASR error:', e);
      alert('Transcription failed: ' + e.message + '\n(Check CORS and ASR_URL.)');
    }
  }

  // Insert `text` at the caret of `el`, picking the right strategy for
  // textarea/input, contenteditable, or anything else (append).
  function insertAtCursor(el, text) {
    el.focus();
    if (el instanceof HTMLTextAreaElement || el instanceof HTMLInputElement) {
      const value = getValue(el);
      const start = el.selectionStart ?? value.length;
      const end = el.selectionEnd ?? value.length;
      setReactValue(el, value.slice(0, start) + text + value.slice(end));
      // Place the caret immediately after the inserted text.
      const caret = start + text.length;
      el.selectionStart = caret;
      el.selectionEnd = caret;
    } else if (el.isContentEditable) {
      insertIntoContentEditable(el, text);
    } else {
      setReactValue(el, (getValue(el) || '') + text);
    }
  }

  // Best-effort submit: synthesize an Enter keydown on the input, then
  // click the first button whose label looks like a send control.
  function trySend(input) {
    input.dispatchEvent(new KeyboardEvent('keydown', {
      bubbles: true, cancelable: true, key: 'Enter', code: 'Enter'
    }));
    const sendButton = Array.from(document.querySelectorAll('button, [role=button]'))
      .find(b => /send|ask|submit|enter/i.test(b.textContent || b.getAttribute('aria-label') || ''));
    sendButton?.click();
  }

  // AnythingLLM is a SPA, so retry injection on every DOM mutation, plus
  // two one-shot fallbacks for the initial render.
  const domObserver = new MutationObserver(ensureButton);
  domObserver.observe(document.documentElement, { childList: true, subtree: true });
  window.addEventListener('load', ensureButton);
  setTimeout(ensureButton, 1200);

  // Console helper: logs and returns the input element the script targets.
  window.whisperDebugInput = () => {
    const found = findChatInput();
    console.log('whisperDebugInput →', found?.el, 'in doc', found?.doc?.location?.href);
    return found?.el || null;
  };
})();

✅ Usage

  • Click 🎤 Record to start dictating.
  • Click again to stop. Your spoken words are transcribed and inserted into the chat input.
  • Hold Shift while stopping to automatically send the message.

You can modify the ASR_URL at the top of the script to point to any Whisper-compatible transcription service. Use the console command window.whisperDebugInput() to verify which input field the script detects.