Multimodal Browser AI with Transformers.js for Photographs and Speech

<title>Multimodal Media Analyzer</title>

/* Global reset: border-box sizing and no default spacing on any element */
* {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Page shell: a centred column, at most 820px wide */
body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}

/* Page header: title plus a muted one-line description */
header {
  margin-bottom: 1.5rem;
}

header h1 {
  font-size: 1.5rem;
}

header p {
  color: #64748b;
  font-size: 0.9rem;
  margin-top: 0.2rem;
}

/* Model status indicators: a wrapping row of small pill badges */
.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}

/* Default badge colours (amber) */
.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}

/* Green variant, used once the script adds the `ready` class to a badge */
.model-badge.ready {
  background: #dcfce7;
  color: #15803d;
}

/* Tab bar: equal-width tabs inside a bordered white strip */
.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}

.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}

/* The selected tab is filled blue */
.tab.active {
  background: #2563eb;
  color: white;
  font-weight: 600;
}

/* Input panels: only the panel carrying `.active` is displayed */
.panel {
  display: none;
}

.panel.active {
  display: block;
}

/* Image upload drop zone */
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}

/* The file input inside the drop zone is hidden */
.upload-area input {
  display: none;
}

/* Image preview: hidden by default, capped at 320px tall */
#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none;
  object-fit: cover;
}

/* Microphone controls */
.mic-center {
  text-align: center;
  padding: 1rem 0;
}

/* Round record button with centred content */
#rec-btn {
  width: 72px;
  height: 72px;
  border-radius: 50%;
  border: none;
  background: #dc2626;
  color: white;
  font-size: 1.6rem;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  margin: 0 auto 0.5rem;
}

#rec-btn.recording {
  background: #374151;
}

#rec-btn:disabled {
  background: #94a3b8;
  cursor: not-allowed;
}

#rec-timer {
  font-weight: 600;
  color: #374151;
  margin-bottom: 0.25rem;
}

#rec-hint {
  font-size: 0.85rem;
  color: #64748b;
}

#wave-canvas {
  display: block;
  margin: 0.5rem auto;
  border-radius: 4px;
}

/* Results grid: as many columns of at least 220px as fit */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}

.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}

/* Small uppercase card heading */
.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

/* One classification row: label on the left, score on the right */
.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}

.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}

.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}

/* pre-wrap keeps line breaks that appear in the transcript text */
.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}

.placeholder-text {
  color: #94a3b8;
  font-style: italic;
  font-size: 0.9rem;
}

#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Narrow screens: force a single results column */
@media (max-width: 500px) {
  .results-grid {
    grid-template-columns: 1fr;
  }
}

<h1>Multimodal Media Analyzer</h1>

<p>Image classification, captioning, and speech transcription — all in your browser.</p>

<span class="model-badge" id="badge-cls">Classifier: loading…</span>

<span class="model-badge" id="badge-cap">Captioner: loading…</span>

<span class="model-badge" id="badge-asr">Whisper: loading…</span>

</div>

</header>

<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>

<div class="tab active" data-tab="image">🖼 Image Analysis</div>

<div class="tab" data-tab="speech">🎙 Speech Transcription</div>

</div>

<!-- Image panel -->

<p>Click or drag an image to analyze</p>

JPG, PNG, WebP, GIF supported

</p>

</div>

</div>

<!-- Speech panel -->

<div id="rec-hint">Waiting for Whisper model…</div>

</div>

<!-- Results – shown for both modes -->

<!-- Image results (shown in image mode) -->

<h3>Classification</h3>

<p class="placeholder-text">No results yet.</p>

</div>

<h3>Caption</h3>

<p class="placeholder-text">No caption yet.</p>

</div>

<!-- Speech results (shown in speech mode) -->

<h3>Transcription</h3>

<p class="placeholder-text">Record audio to see the transcription.</p>

</div>

// Transformers.js is loaded straight from the CDN, pinned to an exact version.
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';

// ── Pipeline references ───────────────────────────────────────────────

// Each stays undefined until its model has finished loading.
let classifier;
let captioner;
let transcriber;

// Number of model badges that have been flipped to "ready" so far.
let readyCount = 0;

/**
 * Flip a model badge to its "ready" state and, once three badges are ready,
 * unlock the recording UI.
 *
 * The call is idempotent per badge: a badge that already carries the `ready`
 * class is left alone and is not counted again, so a loader that reports
 * completion more than once for the same model cannot push `readyCount` to 3
 * before every model has actually reported in.
 *
 * @param {string} badgeId - DOM id of the badge element to update.
 * @param {string} label - Model name shown in the badge text.
 * @returns {void}
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);
  if (badge.classList.contains('ready')) return;

  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent = 'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}

/**
 * Load one pipeline and flip its badge once the pipeline promise has resolved.
 *
 * The badge is driven by promise resolution rather than by progress events:
 * resolution happens exactly once per pipeline, whereas a progress callback
 * may report a completion status several times while a model loads
 * (NOTE(review): per-file `done` events in Transformers.js — confirm).
 *
 * @param {string} task - Pipeline task name.
 * @param {string} model - Model id to load.
 * @param {string} badgeId - DOM id of the badge to mark ready.
 * @param {string} label - Model name shown in the badge.
 * @returns {Promise<Function>} The loaded pipeline.
 */
async function loadPipeline(task, model, badgeId, label) {
  const loaded = await pipeline(task, model, { dtype: 'q8' });
  markReady(badgeId, label);
  return loaded;
}

// Load all three pipelines simultaneously
Promise.all([
  loadPipeline('image-classification', 'Xenova/vit-base-patch16-224', 'badge-cls', 'Classifier'),
  loadPipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', 'badge-cap', 'Captioner'),
  loadPipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', 'badge-asr', 'Whisper'),
])
  .then(([cls, cap, asr]) => {
    classifier = cls;
    captioner = cap;
    transcriber = asr;
  })
  .catch((err) => {
    globalStatus.textContent = `Error loading models: ${err.message}`;
  });

// ── UI references ─────────────────────────────────────────────────────

// Elements are looked up once, when the module is evaluated.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
const waveCanvas = document.getElementById('wave-canvas');
// 2D drawing context used for the live waveform.
const waveCtx = waveCanvas.getContext('2d');

// ── Image analysis ────────────────────────────────────────────────────

/**
 * Classify and caption an image, then render both results into the page.
 *
 * Does nothing except update the status line while either pipeline is still
 * unset. Pipeline failures are caught and shown in the status line.
 *
 * @param {string} dataUrl - Image source passed to both pipelines (the upload
 *   handler in this file passes a FileReader data URL).
 * @returns {Promise<void>} Settles after the results or an error are shown.
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  globalStatus.textContent = 'Running classification and captioning…';

  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');
  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Rows are built with textContent so model output is never parsed as HTML.
    const rows = classResults.map(({ label, score }) => {
      const row = document.createElement('div');
      row.className = 'label-item';

      const name = document.createElement('span');
      name.textContent = label;

      const percent = document.createElement('span');
      percent.className = 'label-score';
      percent.textContent = `${(score * 100).toFixed(1)}%`;

      row.append(name, percent);
      return row;
    });
    clsContent.replaceChildren(...rows);

    // Render the generated caption, wrapped in quotation marks
    const caption = document.createElement('p');
    caption.className = 'caption-body';
    caption.textContent = `"${captionResults[0]?.generated_text ?? 'No caption.'}"`;
    capContent.replaceChildren(caption);

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// File upload handler for images: drop zone, hidden file input, and preview.
const imgDrop = document.getElementById('img-drop');
const imgInput = document.getElementById('img-input');
const imgPrev = document.getElementById('img-preview');

/**
 * Preview a user-supplied image file and send it for analysis.
 *
 * A missing file, or one whose MIME type does not start with `image/`, is
 * ignored. A read failure is reported in the global status line.
 *
 * @param {File|undefined} file - File taken from the input or the drop event.
 * @returns {void}
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) return;

  const reader = new FileReader();

  reader.onload = () => {
    const dataUrl = reader.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    // Not awaited: analyzeImage reports pipeline errors in the status line itself.
    void analyzeImage(dataUrl);
  };

  reader.onerror = () => {
    globalStatus.textContent = `Error: ${reader.error?.message ?? 'could not read file'}`;
  };

  reader.readAsDataURL(file);
}

// Clicking the drop zone forwards the click to the file input.
imgDrop.addEventListener('click', () => imgInput.click());

// Only the first selected file is handled.
imgInput.addEventListener('change', (e) => handleImageFile(e.target.files[0]));

// Suppress the browser's default handling while a drag is over the zone.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());

imgDrop.addEventListener('drop', (e) => {
  e.preventDefault();
  // Only the first dropped file is handled.
  handleImageFile(e.dataTransfer.files[0]);
});

// ── Audio decoding helper ─────────────────────────────────────────────

/**
 * Decode an encoded audio buffer into the samples of its first channel.
 *
 * A dedicated AudioContext with `sampleRate: 16000` is created for each call
 * and closed again before returning, so repeated recordings do not pile up
 * open contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<Float32Array>} Channel 0 of the decoded buffer.
 * @throws Rejects with whatever `decodeAudioData` rejects with.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0);
  } finally {
    // Best-effort release: a failure to close must not mask the decode outcome.
    audioCtx.close().catch(() => {});
  }
}

// ── Speech transcription ──────────────────────────────────────────────

/**
 * Transcribe audio samples and render the text into the page.
 *
 * Does nothing except update the status line while the transcriber is still
 * unset. Transcription failures are caught and shown in the status line.
 *
 * @param {Float32Array} audioData - Samples passed to the transcriber (the
 *   recorder in this file passes the output of decodeAudio).
 * @returns {Promise<void>} Settles after the transcript or an error is shown.
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  const asrContent = document.getElementById('asr-content');
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    // textContent keeps the transcript from being parsed as HTML.
    const transcript = document.createElement('p');
    transcript.className = 'transcript-body';
    transcript.textContent = result.text.trim();
    asrContent.replaceChildren(transcript);

    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// ── Microphone recording ──────────────────────────────────────────────

// Recorder for the current or most recent recording.
let mediaRecorder;
// Data chunks collected from the recorder for the current recording.
let audioChunks = [];
// Handle of the once-per-second timer that updates the elapsed-time label.
let timerInterval;
// Analyser node that drawWave reads samples from.
let analyserNode;
// Handle of the pending animation frame scheduled by drawWave.
let animId;
// Seconds elapsed in the current recording.
let secs = 0;

/**
 * Draw one frame of the live waveform and schedule the next frame.
 *
 * Reads the analyser's current time-domain bytes, plots them as a single
 * polyline across the canvas, and stores the requested animation-frame handle
 * in `animId`.
 *
 * @returns {void}
 */
function drawWave() {
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  const { width, height } = waveCanvas;
  waveCtx.clearRect(0, 0, width, height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  samples.forEach((value, i) => {
    const x = (i / samples.length) * width;
    // A byte value of 128 maps to the vertical middle of the canvas.
    const y = (value / 128.0) * (height / 2);
    if (i === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  });

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}

// Record button: the first click starts a recording, the next click stops it.
recBtn.addEventListener('click', async () => {
  // Stop path: reset the UI here; recorder.onstop (set below) takes over
  // decoding and transcription.
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop();
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Analyser that drawWave reads from for the live waveform.
    const actx = new AudioContext();
    analyserNode = actx.createAnalyser();
    actx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    mediaRecorder = new MediaRecorder(stream);
    audioChunks = [];

    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size) audioChunks.push(e.data);
    };

    mediaRecorder.onstop = async () => {
      // Release the microphone and the analyser context first, so they are
      // freed even if decoding or transcription fails.
      stream.getTracks().forEach((track) => track.stop());
      actx.close().catch(() => {}); // best-effort cleanup

      try {
        const blob = new Blob(audioChunks, { type: 'audio/webm' });
        const arrayBuffer = await blob.arrayBuffer();
        const audioData = await decodeAudio(arrayBuffer);
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        // Without this the hint would stay on "Processing…" forever.
        recHint.textContent = `Audio error: ${err.message}`;
      }
    };

    mediaRecorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';

    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent = `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);

    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    recHint.textContent = `Mic error: ${err.message}`;
  }
});

// ── Tab switching ─────────────────────────────────────────────────────

// Clicking a tab deactivates every tab and panel, then activates the clicked
// tab and the panel whose id is `panel-<data-tab value>`.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document
      .querySelectorAll('.tab, .panel')
      .forEach((el) => el.classList.remove('active'));

    tab.classList.add('active');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('active');
  });
});

Source link

Multimodal Browser AI with Transformers.js for Photographs and Speech

MetaMask simply gave AI brokers a DeFi pockets with a leash

US Inflation Hits 3-Yr Excessive, Pressuring Bitcoin and Gold

Taylor Swift, Scooter Braun Almost Cross Paths at Knicks Finals Recreation in NYC

Apple’s iPadOS 27 beta downloads briefly included two unsupported iPad Professional fashions

First-Ever EDC Pageant in Colombia to Function deadmau5, Alesso, ILLENIUM and Extra: See the Full Lineup

Qualcomm teases ‘one thing new,’ and we’d see it at Meta Join

A basic mind check uncovered AI’s greatest weak spot

AI, jobs, and the following era

Startup’s nuclear-inspired cooling system might make knowledge facilities extra sustainable | MIT Information

BHP makes use of AI to seek out higher methods to extract copper | Microsoft Sign Weblog

The implications of counting on AI for correct information | MIT Information

Atos Group and Microsoft broaden strategic collaboration to scale safe agentic AI throughout Atos Group workforce and purchasers

Leave a ReplyCancel reply

5 Worst Performing ETFs of 2025 So Far

Samsung Galaxy S26 Extremely: what to anticipate

Offers: Google Pixel 9 and Samsung Galaxy S24 provides

6 Methods Employers Get You To Work For Free (Legally)

Is Warren Buffett’s Final Buy at Berkshire Hathaway a Prime Inventory Decide for 2026?

Xiaomi 15T Professional in for assessment

Ethereum (ETH) builders are exploring new token requirements as privateness returns to focus

Anthropic Walks Again Coverage That Might Have ‘Sabotaged’ AI Researchers Utilizing Claude

MetaMask simply gave AI brokers a DeFi pockets with a leash

US Inflation Hits 3-Yr Excessive, Pressuring Bitcoin and Gold

Taylor Swift, Scooter Braun Almost Cross Paths at Knicks Finals Recreation in NYC

Apple’s iPadOS 27 beta downloads briefly included two unsupported iPad Professional fashions

First-Ever EDC Pageant in Colombia to Function deadmau5, Alesso, ILLENIUM and Extra: See the Full Lineup

Qualcomm teases ‘one thing new,’ and we’d see it at Meta Join

Leave a ReplyCancel reply

Log In

Sign In

Forgot password?

Your password reset link appears to be invalid or expired.

Log in

Privacy Policy

Add to Collection

No Collections