AnythingLLM 🎤 Voice Recording (Local Whisper Integration)

🎤 AnythingLLM – Local Whisper Recording Integration

Add a Record button to AnythingLLM that lets you dictate messages using your microphone. The audio is sent to your own local Whisper server for transcription — no OpenAI API required.

🧩 Requirements

  • A running Whisper ASR endpoint (example: https://whisper.parsons.familyds.net/asr) that allows CORS requests from your AnythingLLM origin — the script uploads audio via fetch from the AnythingLLM page, so the Whisper server must send appropriate Access-Control-Allow-Origin headers
  • A supported desktop browser: Chrome, Firefox, Edge, or Brave
  • The Tampermonkey browser extension installed

⚙️ Installation Steps

  • Install the Tampermonkey extension for your browser.
  • Click the Tampermonkey icon → Create a new script…
  • Delete any existing placeholder code.
  • Paste the full script below.
  • Save the script (File → Save or Ctrl+S).
  • Visit your AnythingLLM site (or refresh it). You should see a 🎤 Record button next to the chat input.
  • Click once to record, click again to stop — your transcribed text appears automatically. Hold Shift while stopping to auto-send.

🧾 Full Script — AnythingLLM 🎤 Record (v1.3)

// ==UserScript==
// @name         AnythingLLM – 🎤 Record (local Whisper)
// @namespace    https://parsons.family
// @version      1.3
// @description  Adds a mic record button to AnythingLLM; transcribes via local Whisper; no OpenAI.
// @match        https://anythingllm.parsons.familyds.net/*
// @match        http://localhost:3001/*
// @match        http://192.168.1.200:3001/*
// @match        http://ai-server:3001/*
// @grant        none
// @run-at       document-idle
// ==/UserScript==

(function () {
  'use strict';

  const ASR_URL = 'https://whisper.parsons.familyds.net/asr';

  // Grab the native `value` setters off the element prototypes so writes
  // bypass React's wrapped accessors (React replaces the instance-level
  // setter; the prototype-level one still updates the real DOM value).
  const nativeValueSetter = (proto) =>
    Object.getOwnPropertyDescriptor(proto, 'value')?.set;
  const textareaSetter = nativeValueSetter(HTMLTextAreaElement.prototype);
  const inputSetter = nativeValueSetter(HTMLInputElement.prototype);

  /**
   * Write `val` into an input-like element in a way React-controlled
   * inputs pick up, then dispatch beforeinput/input/change so framework
   * listeners observe the change.
   * @param {Element} el  textarea, input, or contenteditable element
   * @param {string} val  the full replacement value
   */
  function setReactValue(el, val) {
    if (el instanceof HTMLTextAreaElement && textareaSetter) {
      textareaSetter.call(el, val);
    } else if (el instanceof HTMLInputElement && inputSetter) {
      inputSetter.call(el, val);
    } else if (el.isContentEditable) {
      // FIX: this branch was previously empty, so contenteditable targets
      // got the events below without the text ever being written.
      el.textContent = val;
    } else {
      el.value = val;
    }

    // Some browsers reject synthetic InputEvent construction; best-effort.
    try {
      el.dispatchEvent(new InputEvent('beforeinput', {
        inputType: 'insertText', data: val, bubbles: true, cancelable: true, composed: true
      }));
    } catch {}
    el.dispatchEvent(new Event('input', { bubbles: true, cancelable: true, composed: true }));
    el.dispatchEvent(new Event('change', { bubbles: true }));
  }

  // Read the current text of any supported input-like element
  // (empty string for anything unrecognized).
  function getValue(el) {
    const isFormField =
      el instanceof HTMLTextAreaElement || el instanceof HTMLInputElement;
    if (isFormField) return el.value || '';
    return el.isContentEditable ? (el.innerText || '') : '';
  }

  // Insert `text` at the caret of a contenteditable element, then fire
  // the beforeinput/input/change events frameworks listen for.
  function insertIntoContentEditable(el, text) {
    el.focus();
    const doc = el.ownerDocument;
    const selection = doc.getSelection();
    if (selection && selection.rangeCount > 0) {
      const range = selection.getRangeAt(0);
      range.deleteContents();
      range.insertNode(doc.createTextNode(text));
      range.collapse(false); // caret lands just after the inserted text
      selection.removeAllRanges();
      selection.addRange(range);
    } else {
      // No usable selection — append at the end instead.
      el.appendChild(doc.createTextNode(text));
    }
    el.dispatchEvent(new InputEvent('beforeinput', {
      inputType: 'insertText', data: text, bubbles: true, cancelable: true, composed: true
    }));
    el.dispatchEvent(new Event('input', { bubbles: true }));
    el.dispatchEvent(new Event('change', { bubbles: true }));
  }

  // Heuristic: a candidate counts as the chat input only if it is
  // rendered and reasonably large (>200×30 px).
  function isVisible(el) {
    const style = getComputedStyle(el);
    if (style.display === 'none' || style.visibility === 'hidden') return false;
    const rect = el.getBoundingClientRect();
    return rect.width > 200 && rect.height > 30;
  }

  // Sort comparator: larger on-screen area first.
  function byArea(a, b) {
    const area = (el) => {
      const r = el.getBoundingClientRect();
      return r.width * r.height;
    };
    return area(b) - area(a);
  }

  // Collect every plausible chat-input element in `rootDoc`,
  // visible ones only, largest first.
  function queryAllCandidates(rootDoc) {
    const selectors = [
      'textarea[placeholder*="message" i]',
      'textarea[aria-label*="message" i]',
      'textarea[aria-multiline="true"]',
      'textarea',
      '[role="textbox"][contenteditable="true"]',
      '.ProseMirror[contenteditable="true"]',
      '.ql-editor[contenteditable="true"]'
    ];
    const candidates = selectors.flatMap(
      (sel) => Array.from(rootDoc.querySelectorAll(sel))
    );
    return candidates.filter(isVisible).sort(byArea);
  }

  // Locate the chat input: top document first, then each same-origin
  // iframe. Returns { el, doc } or null if nothing matched.
  function findChatInput() {
    const inTop = queryAllCandidates(document);
    if (inTop.length > 0) return { el: inTop[0], doc: document };

    for (const frame of document.querySelectorAll('iframe')) {
      try {
        const frameDoc = frame.contentDocument;
        if (!frameDoc) continue;
        const inFrame = queryAllCandidates(frameDoc);
        if (inFrame.length > 0) return { el: inFrame[0], doc: frameDoc };
      } catch {
        // Cross-origin iframe — cannot inspect; skip it.
      }
    }
    return null;
  }

  // Inject the 🎤 Record button right after the chat input.
  // Idempotent: bails out if the button already exists or no input is found.
  function ensureButton() {
    if (document.getElementById('whisper-record-btn')) return;
    const found = findChatInput();
    if (!found) return;

    const input = found.el;
    const btn = document.createElement('button');
    btn.id = 'whisper-record-btn';
    btn.type = 'button'; // prevent accidental form submission
    btn.textContent = '🎤 Record';
    Object.assign(btn.style, {
      marginLeft: '8px',
      padding: '6px 10px',
      borderRadius: '10px',
      border: '1px solid #ccc',
      cursor: 'pointer',
      fontSize: '0.95rem'
    });

    input.insertAdjacentElement('afterend', btn);
    wireRecorder(btn, input);
    console.log('🎤 Record button injected next to', input);
  }

  /**
   * Wire click-to-toggle recording onto `btn`.
   * First click: request the mic and start a MediaRecorder.
   * Second click: stop; onstop hands the audio to doTranscribe, and the
   * Shift state of the stopping click decides auto-send.
   * @param {HTMLButtonElement} btn      the injected record button
   * @param {Element} inputEl            fallback insertion target
   */
  function wireRecorder(btn, inputEl) {
    let rec = null;
    let chunks = [];
    let stoppingEvent = null;

    btn.addEventListener('click', async (ev) => {
      if (!rec) {
        let stream = null;
        try {
          stream = await navigator.mediaDevices.getUserMedia({ audio: true });
          chunks = [];
          // FIX: prefer webm but fall back to the browser default so the
          // constructor does not throw where webm is unsupported (e.g. Safari).
          const opts = (window.MediaRecorder && MediaRecorder.isTypeSupported &&
                        MediaRecorder.isTypeSupported('audio/webm'))
            ? { mimeType: 'audio/webm' }
            : undefined;
          rec = new MediaRecorder(stream, opts);
          rec.ondataavailable = e => e.data && e.data.size && chunks.push(e.data);
          rec.onstop = () => doTranscribe(btn, inputEl, chunks, stoppingEvent).finally(() => {
            stream.getTracks().forEach(t => t.stop());
            rec = null; chunks = []; stoppingEvent = null;
            btn.disabled = false; btn.textContent = '🎤 Record';
          });
          rec.start();
          btn.textContent = '⏹ Stop (Shift=auto-send)';
        } catch (e) {
          // FIX: release the mic if setup failed after getUserMedia
          // succeeded (previously the stream leaked and the mic stayed on).
          if (stream) stream.getTracks().forEach(t => t.stop());
          rec = null;
          console.error(e);
          alert('Mic permission denied or unavailable.');
        }
      } else {
        stoppingEvent = ev;          // remember Shift state for auto-send
        btn.disabled = true;         // debounce until transcription finishes
        btn.textContent = '…processing';
        rec.stop();                  // fires onstop above
      }
    });
  }

  /**
   * Upload the recorded audio to the Whisper endpoint, extract the
   * transcript, and insert it into the chat input. Falls back to the
   * clipboard (or an alert) when no input can be found. Holding Shift on
   * the stopping click (`ev.shiftKey`) triggers auto-send.
   * @param {HTMLButtonElement} btn        (unused here; kept for callers)
   * @param {Element} inputFallback        insertion target if lookup fails
   * @param {Blob[]} chunks                recorded audio chunks
   * @param {MouseEvent|null} ev           the click event that stopped recording
   */
  async function doTranscribe(btn, inputFallback, chunks, ev) {
    try {
      const blob = new Blob(chunks, { type: 'audio/webm' });
      const form = new FormData();
      let url = ASR_URL;

      if (/\/asr(\?|$)/.test(ASR_URL)) {
        // whisper-asr-webservice style endpoint: options go in the query string.
        const joiner = ASR_URL.includes('?') ? '&' : '?';
        url += joiner + 'task=transcribe&language=en&output=json';
        form.append('audio_file', blob, 'recording.webm');
      } else {
        // Generic endpoint: ensure a /transcribe path and use the "file" field.
        if (!/\/transcribe(\?|$)/.test(ASR_URL)) {
          url = ASR_URL.replace(/\/$/, '') + '/transcribe';
        }
        form.append('file', blob, 'recording.webm');
      }

      const res = await fetch(url, { method: 'POST', body: form });
      if (!res.ok) throw new Error(`ASR ${res.status} ${res.statusText}`);

      // Servers differ: some return JSON, some plain text.
      const raw = await res.text();
      let parsed;
      try { parsed = JSON.parse(raw); } catch { parsed = { text: raw }; }

      // Accept the common response shapes: bare string, {text},
      // {transcription}, or [{text}, ...].
      let text = '';
      if (typeof parsed === 'string') {
        text = parsed;
      } else if (parsed && typeof parsed.text === 'string') {
        text = parsed.text;
      } else if (parsed && typeof parsed.transcription === 'string') {
        text = parsed.transcription;
      } else if (Array.isArray(parsed) && parsed[0]?.text) {
        text = parsed[0].text;
      }
      text = (text || '').trim();

      const found = findChatInput();
      const target = (found && found.el) || inputFallback || document.activeElement;

      if (target && text) {
        // Add a separating space unless the input is empty or ends with one.
        const current = getValue(target);
        const prefix = (current.endsWith(' ') || current === '') ? '' : ' ';
        insertAtCursor(target, prefix + text);
        if (ev && ev.shiftKey) trySend(target);
      } else if (navigator.clipboard && text) {
        await navigator.clipboard.writeText(text);
        alert('Transcript copied to clipboard (paste into AnythingLLM input).');
      } else {
        alert('Transcribed text:\n\n' + text);
      }
    } catch (e) {
      console.error('ASR error:', e);
      alert('Transcription failed: ' + e.message + '\n(Check CORS and ASR_URL.)');
    }
  }

  // Insert `text` at the caret of `el`, picking the right strategy for
  // textarea/input, contenteditable, or anything else (append).
  function insertAtCursor(el, text) {
    el.focus();
    if (el instanceof HTMLTextAreaElement || el instanceof HTMLInputElement) {
      const value = getValue(el);
      const start = el.selectionStart ?? value.length;
      const end = el.selectionEnd ?? value.length;
      setReactValue(el, value.slice(0, start) + text + value.slice(end));
      // Place the caret immediately after the inserted text.
      const caret = start + text.length;
      el.selectionStart = caret;
      el.selectionEnd = caret;
    } else if (el.isContentEditable) {
      insertIntoContentEditable(el, text);
    } else {
      setReactValue(el, (getValue(el) || '') + text);
    }
  }

  // Best-effort submit: synthesize an Enter keydown on the input, then
  // click the first button whose label looks like a send control.
  function trySend(input) {
    input.dispatchEvent(new KeyboardEvent('keydown', {
      bubbles: true, cancelable: true, key: 'Enter', code: 'Enter'
    }));
    const sendButton = Array.from(document.querySelectorAll('button, [role=button]'))
      .find(b => /send|ask|submit|enter/i.test(b.textContent || b.getAttribute('aria-label') || ''));
    sendButton?.click();
  }

  // AnythingLLM is a SPA, so retry injection on every DOM mutation, plus
  // two one-shot fallbacks for the initial render.
  const domObserver = new MutationObserver(ensureButton);
  domObserver.observe(document.documentElement, { childList: true, subtree: true });
  window.addEventListener('load', ensureButton);
  setTimeout(ensureButton, 1200);

  // Console helper: logs and returns the input element the script targets.
  window.whisperDebugInput = () => {
    const found = findChatInput();
    console.log('whisperDebugInput →', found?.el, 'in doc', found?.doc?.location?.href);
    return found?.el || null;
  };
})();

✅ Usage

  • Click 🎤 Record to start dictating.
  • Click again to stop. Your spoken words are transcribed and inserted into the chat input.
  • Hold Shift while stopping to automatically send the message.

You can modify the ASR_URL at the top of the script to point to any Whisper-compatible transcription service. Use the console command window.whisperDebugInput() to verify which input field the script detects.