in

Multimodal Browser AI with Transformers.js for Photographs and Speech


<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Multimodal Media Analyzer</title>
  <!-- Page styles. They must sit inside a style element; as bare text the
       rules are rendered on the page instead of being applied. -->
  <style>
    * { box-sizing: border-box; margin: 0; padding: 0; }

    body {
      font-family: system-ui, sans-serif;
      max-width: 820px;
      margin: 0 auto;
      padding: 1.5rem 1rem;
      background: #f1f5f9;
      color: #1e293b;
    }

    header { margin-bottom: 1.5rem; }
    header h1 { font-size: 1.5rem; }
    header p  { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

    /* Model status indicators */
    .model-status-bar {
      display: flex;
      gap: 0.5rem;
      flex-wrap: wrap;
      margin-top: 0.75rem;
    }
    .model-badge {
      font-size: 0.78rem;
      padding: 0.2rem 0.6rem;
      border-radius: 12px;
      background: #fef3c7;
      color: #92400e;
    }
    /* Added by the script once a model has finished loading */
    .model-badge.ready { background: #dcfce7; color: #15803d; }

    /* Tab bar */
    .tabs {
      display: flex;
      background: white;
      border-radius: 8px;
      padding: 0.25rem;
      gap: 0.25rem;
      margin-bottom: 1.25rem;
      border: 1px solid #e2e8f0;
    }
    .tab {
      flex: 1;
      padding: 0.5rem;
      text-align: center;
      border-radius: 6px;
      cursor: pointer;
      font-size: 0.9rem;
      color: #64748b;
      transition: all 0.15s;
    }
    .tab.active { background: #2563eb; color: white; font-weight: 600; }

    /* Input panels: only the panel carrying .active is visible */
    .panel { display: none; }
    .panel.active { display: block; }

    .upload-area {
      background: white;
      border: 2px dashed #cbd5e1;
      border-radius: 8px;
      padding: 2rem;
      text-align: center;
      cursor: pointer;
    }
    /* The native file picker stays hidden; the drop area is the click target */
    .upload-area input { display: none; }

    /* Hidden until the script assigns an image and sets display to block */
    #img-preview {
      margin-top: 1rem;
      max-width: 100%;
      max-height: 320px;
      border-radius: 8px;
      display: none;
      object-fit: cover;
    }

    .mic-center { text-align: center; padding: 1rem 0; }
    #rec-btn {
      width: 72px; height: 72px;
      border-radius: 50%; border: none;
      background: #dc2626; color: white;
      font-size: 1.6rem; cursor: pointer;
      display: flex; align-items: center; justify-content: center;
      margin: 0 auto 0.5rem;
    }
    #rec-btn.recording { background: #374151; }
    #rec-btn:disabled  { background: #94a3b8; cursor: not-allowed; }
    #rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }
    #rec-hint  { font-size: 0.85rem; color: #64748b; }
    #wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

    /* Results grid */
    .results-grid {
      display: grid;
      grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
      gap: 1rem;
      margin-top: 1.25rem;
    }
    .result-card {
      background: white;
      border: 1px solid #e2e8f0;
      border-radius: 8px;
      padding: 1rem;
    }
    .result-card h3 {
      font-size: 0.75rem;
      text-transform: uppercase;
      letter-spacing: 0.06em;
      color: #64748b;
      margin-bottom: 0.6rem;
    }
    .label-item {
      display: flex;
      justify-content: space-between;
      align-items: center;
      padding: 0.25rem 0;
      font-size: 0.875rem;
      border-bottom: 1px solid #f1f5f9;
    }
    .label-score {
      font-size: 0.8rem;
      color: #64748b;
      background: #f1f5f9;
      padding: 0.1rem 0.4rem;
      border-radius: 4px;
    }
    .caption-body {
      font-size: 0.95rem;
      line-height: 1.5;
      font-style: italic;
      color: #334155;
    }
    .transcript-body {
      font-size: 0.95rem;
      line-height: 1.6;
      color: #334155;
      white-space: pre-wrap;
    }
    .placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }
    #global-status {
      font-size: 0.85rem;
      color: #64748b;
      margin-bottom: 1rem;
    }

    /* Narrow screens: stack the result cards in a single column */
    @media (max-width: 500px) {
      .results-grid { grid-template-columns: 1fr; }
    }
  </style>

  

  <header>

    <h1>Multimodal Media Analyzer</h1>

    <p>Image classification, captioning, and speech transcription — all in your browser.</p>

    <div class=”model-status-bar”>

      <span class=”model-badge” id=”badge-cls”>Classifier: loading…</span>

      <span class=”model-badge” id=”badge-cap”>Captioner: loading…</span>

      <span class=”model-badge” id=”badge-asr”>Whisper: loading…</span>

    </div>

  </header>

 

  <div id=”global-status”>Loading models in parallel — first run downloads ~400 MB total.</div>

 

  <div class=”tabs”>

    <div class=”tab active” data-tab=”image”>🖼 Image Analysis</div>

    <div class=”tab” data-tab=”speech”>🎙 Speech Transcription</div>

  </div>

 

  <!– Image panel –>

  <div class=”panel active” id=”panel-image”>

    <div class=”upload-area” id=”img-drop”>

      

      <p>Click or drag an image to analyze</p>

      <p style=”font-size:0.8rem;color:#94a3b8;margin-top:0.3rem”>

        JPG, PNG, WebP, GIF supported

      </p>

    </div>

    <img id=”img-preview” alt=”Preview” />

  </div>

 

  <!– Speech panel –>

  <div class=”panel” id=”panel-speech”>

    <div class=”mic-center”>

      <button id=”rec-btn” disabled>🎙</button>

      <div id=”rec-timer”>0:00</div>

      <div id=”rec-hint”>Waiting for Whisper model…</div>

    </div>

    

  </div>

 

  <!– Results – shown for both modes –>

  <div class=”results-grid” id=”results-grid” style=”display:none”>

    <!– Image results (shown in image mode) –>

    <div class=”result-card” id=”card-cls” style=”display:none”>

      <h3>Classification</h3>

      <div id=”cls-content”>

        <p class=”placeholder-text”>No results yet.</p>

      </div>

    </div>

    <div class=”result-card” id=”card-cap” style=”display:none”>

      <h3>Caption</h3>

      <div id=”cap-content”>

        <p class=”placeholder-text”>No caption yet.</p>

      </div>

    </div>

    <!– Speech results (shown in speech mode) –>

    <div class=”result-card” id=”card-asr” style=”display:none”>

      <h3>Transcription</h3>

      <div id=”asr-content”>

        <p class=”placeholder-text”>Record audio to see the transcription.</p>

      </div>

    </div>

  </div>

 

  

    import { pipeline }

      from ‘https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2’;

 

    // ── Pipeline references ───────────────────────────────────────────────

    let classifier, captioner, transcriber;

    let readyCount = 0;

 

    // Update a model badge to “ready” state

    function markReady(badgeId, label) {

      const badge = document.getElementById(badgeId);

      badge.textContent = `${label}: ready`;

      badge.classList.add(‘ready’);

      readyCount++;

      if (readyCount === 3) {

        globalStatus.textContent =

          ‘All models ready. Upload an image or record audio.’;

        recBtn.disabled = false;

        recHint.textContent = ‘Click to start recording.’;

      }

    }

 

    // Load all three pipelines simultaneously

    Promise.all((

      pipeline(‘image-classification’, ‘Xenova/vit-base-patch16-224’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cls’, ‘Classifier’)

      }),

      pipeline(‘image-to-text’, ‘Xenova/vit-gpt2-image-captioning’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cap’, ‘Captioner’)

      }),

      pipeline(‘automatic-speech-recognition’, ‘Xenova/whisper-tiny.en’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-asr’, ‘Whisper’)

      })

    )).then(((cls, cap, asr)) => {

      classifier  = cls;

      captioner   = cap;

      transcriber = asr;

    }).catch(err => {

      globalStatus.textContent = `Error loading models: ${err.message}`;

    });

 

    // ── UI references ─────────────────────────────────────────────────────

    const globalStatus = document.getElementById(‘global-status’);

    const resultsGrid  = document.getElementById(‘results-grid’);

    const recBtn       = document.getElementById(‘rec-btn’);

    const recHint      = document.getElementById(‘rec-hint’);

    const recTimer     = document.getElementById(‘rec-timer’);

    const waveCanvas   = document.getElementById(‘wave-canvas’);

    const waveCtx      = waveCanvas.getContext(‘2d’);

 

    // ── Image analysis ────────────────────────────────────────────────────

    async function analyzeImage(dataUrl) {

      if (!classifier || !captioner) {

        globalStatus.textContent = ‘Models still loading. Please wait.’;

        return;

      }

 

      globalStatus.textContent = ‘Running classification and captioning…’;

 

      // Show image result cards, hide speech card

      document.getElementById(‘card-cls’).style.display = ‘block’;

      document.getElementById(‘card-cap’).style.display = ‘block’;

      document.getElementById(‘card-asr’).style.display = ‘none’;

      resultsGrid.style.display = ‘grid’;

 

      document.getElementById(‘cls-content’).innerHTML =

        ‘<p class=”placeholder-text”>Classifying…</p>’;

      document.getElementById(‘cap-content’).innerHTML =

        ‘<p class=”placeholder-text”>Generating caption…</p>’;

 

      try {

        // Run classification and captioning in parallel

        const (classResults, captionResults) = await Promise.all((

          classifier(dataUrl, { top_k: 4 }),

          captioner(dataUrl, { max_new_tokens: 60 })

        ));

 

        // Render classification labels

        document.getElementById(‘cls-content’).innerHTML =

          classResults.map(({ label, score }) => `

            <div class=”label-item”>

              <span>${label}</span>

              <span class=”label-score”>${(score * 100).toFixed(1)}%</span>

            </div>`).join(”);

 

        // Render generated caption

        document.getElementById(‘cap-content’).innerHTML =

          `<p class=”caption-body”>”${captionResults(0)?.generated_text ?? ‘No caption.’}”</p>`;

 

        globalStatus.textContent = ‘Analysis complete.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // File upload handler for images

    const imgDrop  = document.getElementById(‘img-drop’);

    const imgInput = document.getElementById(‘img-input’);

    const imgPrev  = document.getElementById(‘img-preview’);

 

    function handleImageFile(file) {

      if (!file?.type.startsWith(‘image/’)) return;

      const reader = new FileReader();

      reader.onload = e => {

        imgPrev.src = e.target.result;

        imgPrev.style.display = ‘block’;

        analyzeImage(e.target.result);

      };

      reader.readAsDataURL(file);

    }

 

    imgDrop.addEventListener(‘click’, () => imgInput.click());

    imgInput.addEventListener(‘change’, e => handleImageFile(e.target.files(0)));

    imgDrop.addEventListener(‘dragover’, e => e.preventDefault());

    imgDrop.addEventListener(‘drop’, e => {

      e.preventDefault();

      handleImageFile(e.dataTransfer.files(0));

    });

 

    // ── Audio decoding helper ─────────────────────────────────────────────

    async function decodeAudio(arrayBuffer) {

      const audioCtx    = new AudioContext({ sampleRate: 16000 });

      const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

      return audioBuffer.getChannelData(0);  // Mono Float32Array at 16kHz

    }

 

    // ── Speech transcription ──────────────────────────────────────────────

    async function runTranscription(audioData) {

      // Show speech result card, hide image cards

      document.getElementById(‘card-cls’).style.display = ‘none’;

      document.getElementById(‘card-cap’).style.display = ‘none’;

      document.getElementById(‘card-asr’).style.display = ‘block’;

      resultsGrid.style.display = ‘grid’;

 

      document.getElementById(‘asr-content’).innerHTML =

        ‘<p class=”placeholder-text”>Transcribing…</p>’;

 

      globalStatus.textContent = ‘Running Whisper transcription…’;

 

      try {

        const result = await transcriber(audioData, {

          chunk_length_s: 30,

          stride_length_s: 5

        });

        document.getElementById(‘asr-content’).innerHTML =

          `<p class=”transcript-body”>${result.text.trim()}</p>`;

        globalStatus.textContent = ‘Transcription complete.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // ── Microphone recording ──────────────────────────────────────────────

    let mediaRecorder, audioChunks = (), timerInterval, analyserNode, animId;

    let secs = 0;

 

    function drawWave() {

      const buf = new Uint8Array(analyserNode.frequencyBinCount);

      analyserNode.getByteTimeDomainData(buf);

      waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);

      waveCtx.beginPath();

      waveCtx.strokeStyle = ‘#2563eb’;

      waveCtx.lineWidth = 1.5;

      buf.forEach((v, i) => {

        const x = (i / buf.length) * waveCanvas.width;

        const y = (v / 128.0) * (waveCanvas.height / 2);

        i === 0 ? waveCtx.moveTo(x, y) : waveCtx.lineTo(x, y);

      });

      waveCtx.stroke();

      animId = requestAnimationFrame(drawWave);

    }

 

    recBtn.addEventListener(‘click’, async () => {

      if (mediaRecorder?.state === ‘recording’) {

        mediaRecorder.stop();

        recBtn.classList.remove(‘recording’);

        recBtn.textContent = ‘🎙’;

        clearInterval(timerInterval);

        cancelAnimationFrame(animId);

        waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);

        recHint.textContent = ‘Processing…’;

      } else {

        try {

          const stream  = await navigator.mediaDevices.getUserMedia({ audio: true });

          const actx    = new AudioContext();

          analyserNode  = actx.createAnalyser();

          actx.createMediaStreamSource(stream).connect(analyserNode);

          analyserNode.fftSize = 256;

 

          mediaRecorder = new MediaRecorder(stream);

          audioChunks   = ();

          mediaRecorder.ondataavailable = e => e.data.size && audioChunks.push(e.data);

          mediaRecorder.onstop = async () => {

            const blob        = new Blob(audioChunks, { type: ‘audio/webm’ });

            const arrayBuffer = await blob.arrayBuffer();

            const audioData   = await decodeAudio(arrayBuffer);

            stream.getTracks().forEach(t => t.stop());

            await runTranscription(audioData);

            recHint.textContent = ‘Click to record again.’;

          };

 

          mediaRecorder.start();

          recBtn.classList.add(‘recording’);

          recBtn.textContent = ‘⏹’;

          secs = 0;

          recTimer.textContent = ‘0:00’;

          timerInterval = setInterval(() => {

            secs++;

            recTimer.textContent =

              `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, ‘0’)}`;

          }, 1000);

          recHint.textContent = ‘Recording… click to stop.’;

          drawWave();

        } catch (err) {

          recHint.textContent = `Mic error: ${err.message}`;

        }

      }

    });

 

    // ── Tab switching ─────────────────────────────────────────────────────

    document.querySelectorAll(‘.tab’).forEach(tab => {

      tab.addEventListener(‘click’, () => {

        document.querySelectorAll(‘.tab, .panel’).forEach(el =>

          el.classList.remove(‘active’));

        tab.classList.add(‘active’);

        document.getElementById(`panel-${tab.dataset.tab}`).classList.add(‘active’);

      });

    });



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *

GIPHY App Key not set. Please check settings

Ethereum (ETH) builders are exploring new token requirements as privacy returns to focus

Anthropic Walks Back Policy That Might Have ‘Sabotaged’ AI Researchers Using Claude