index.html

---
# Feel free to add content and custom Front Matter to this file.
# To modify the layout, see https://jekyllrb.com/docs/themes/#overriding-theme-defaults

layout: default
---

<style>
    .video-container {
        position: relative;
        padding-bottom: 56.25%; /* 16:9 aspect ratio */
        height: 0;
        overflow: hidden;
        max-width: 100%;
        background: #000;
    }
    .video-container iframe {
        position: absolute;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
    }
    audio {
       width: 100%;
    }
    table {
        width: 100%;
        border-collapse: collapse;
    }
   .table-container {
        width: 100%;
        max-width: 100%;
        overflow-x: auto;
        -webkit-overflow-scrolling: touch; /* For smooth scrolling on mobile devices */
    }
    html, body {
        margin: 5px; /* Small fixed margin; applied to both html and body, so the two add up */
        padding: 5px; /* Small fixed padding; likewise applied to both elements */
        overflow-x: hidden; /* Prevent horizontal overflow */
    }
    th, td {
        padding: 4px;
        text-align: left;
        border: 1px solid #ddd;
        font-size: 1em; /* Base font size */
        white-space: nowrap;
    }
    p {
        text-align: justify;
        hyphens: auto;
    }   
    div {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
        max-width: 100%;
    }
    /* Define a class for left-aligned text */
    .left-align {
        text-align: left;
        hyphens: none;
    } 
    /* Outer wrapper of the citation box. */
    .container {
        max-width: 100%;
        background-color: #f4f4f4;
        /* The former "align-items: left" was removed: "left" is not a valid
           align-items value, so browsers discarded the declaration. */
        overflow-x: auto; /* Handle overflow */
        padding: 0;
        margin: 0 auto;
        position: relative; /* Positioning context for the absolutely positioned .highlight button */
    }
    /* Inner wrapper holding the citation text; 30px narrower than its parent. */
    .inner-container {
        width: calc(100% - 30px);
        background-color: #f4f4f4;
        /* The former "align-items: left" was removed: "left" is not a valid
           align-items value, so browsers discarded the declaration. */
        padding: 0;
        margin: 0 auto;
        overflow: hidden;
        position: relative; /* Relative positioning for absolute child positioning */
    }
    pre {
        width: calc(100% - 30px);
        background-color: #f4f4f4;
        border: 0;
        padding: 0px;
        overflow: auto;
        border-radius: 0px;
        margin: 0;
    }
    code {
        font-family: monospace;
        padding: 0px;
        margin: 0;
    }
    .highlight {
        cursor: pointer;
        padding: 4px 4px;
        background-color: #f4f4f4;
        border: 0;
        color: white;
        border-radius: 4px;
        display: flex;
        align-items: center;
        max-width: 100%;
        position: absolute;
        top: 0; /* Position the button at the top */
        right: 0; /* Position the button at the right edge */
        margin: 5px; /* Small margin for aesthetics */
    }
    
    .highlight svg {
        fill: #4e4e4e;
        margin-right: 5px;
    }

    .highlight:hover {
        background-color: #d3d3d3;
    }
</style>

<h1>
    Video Examples
</h1>

<p>
    Here we provide video examples of our method compared to other methods. 
</p>

<h2>
    LRS3 Test Set
</h2>

<p>
    <!-- The prompt is a real label bound to the select so assistive technology announces its purpose. -->
    <label for="videoSelect">Select a video file:</label> &nbsp;&nbsp;
    <select id="videoSelect" onchange="playAudio()">
        <option value="SWiwSBLXS3k/00002">SWiwSBLXS3k/00002</option>
        <option value="J8FyHI00ELY/00004">J8FyHI00ELY/00004</option>
        <option value="vXPJVwwEmiM/00008">vXPJVwwEmiM/00008</option>
        <option value="9JKy6ZfmBn0/00002">9JKy6ZfmBn0/00002</option>
        <option value="sxnlvwprfSc/00011">sxnlvwprfSc/00011</option>
        <option value="8nt3edWLgIg/00004">8nt3edWLgIg/00004</option>
        <option value="1bnzVjOJ6NM/00011">1bnzVjOJ6NM/00011</option>
        <option value="M7KSnq0xkWY/00001">M7KSnq0xkWY/00001</option>
    </select>
</p>


<table>
    <tr>
        <th>Original Video</th>
        <th>Enhanced Video</th>
    </tr>
    <tr>
        <td>
            <video id="videoPlayerOrig" controls>
                <source id="videoSourceOrig" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/orig/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
        <td>
            <video id="videoPlayerEnhanced" controls>
                <source id="videoSourceEnhanced" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/enhanced/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
    </tr>
    <tr>
        <th>IL2S</th>
        <th>IL2S (AV-HuBERT)</th>
    </tr>
    <tr>
        <td>
            <video id="videoPlayerIL2S" controls>
                <source id="videoSourceIL2S" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/il2s/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
        <td>
            <video id="videoPlayerIL2S_AVHuBERT" controls>
                <source id="videoSourceIL2S_AVHuBERT" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/il2s_avhubert/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
    </tr>
    <tr>
        <th>Lipvoicer</th>
        <th>DiffV2S</th>
    </tr>
    <tr>
        <td>
            <video id="videoPlayerLipvoicer" controls>
                <source id="videoSourceLipvoicer" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/lipvoicer/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
        <td>
            <video id="videoPlayerDiffV2S" controls>
                <source id="videoSourceDiffV2S" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/diffv2s/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
    </tr>
    <tr>
        <th>LipDiffuser (run11/7340032)</th>
        <th>LipDiffuser (run11/7340032_aligned)</th>
    </tr>
    <tr>
        <td>
            <video id="videoPlayerLipDiffuserRun11_7340032" controls>
                <source id="videoSourceLipDiffuserRun11_7340032" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/lipdiffuser/run11/7340032/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
        <td>
            <video id="videoPlayerLipDiffuserRun11_0007340_aligned" controls>
                <source id="videoSourceLipDiffuserRun11_0007340_aligned" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/lipdiffuser/run11/0007340_aligned/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
    </tr>
    <tr>
        <th>LipDiffuser (run12/2097152)</th>
        <th></th>
    </tr>
    <tr>
        <td>
            <video id="videoPlayerLipDiffuserRun12_2097152" controls>
                <source id="videoSourceLipDiffuserRun12_2097152" src="https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/lipdiffuser/run12/2097152/SWiwSBLXS3k/00002.mp4" type="video/mp4">
                Your browser does not support the video element.
            </video>
        </td>
        <td></td>
    </tr>
</table>

<script>
    // Reloads every comparison video for the clip currently chosen in #videoSelect.
    // Invoked by the select's onchange handler. Despite its name, the function
    // operates on <video> elements only.
    function playAudio() {
        var baseUrl = "https://www2.informatik.uni-hamburg.de/sp/audio/publications/lip2speech/files/";

        // Maps the id suffix shared by each <video>/<source> pair
        // ("videoPlayer" + suffix and "videoSource" + suffix) to the directory
        // below baseUrl that holds that system's files.
        var systems = {
            "Orig": "orig",
            "IL2S_AVHuBERT": "il2s_avhubert",
            "IL2S": "il2s",
            "Enhanced": "enhanced",
            "Lipvoicer": "lipvoicer",
            "DiffV2S": "diffv2s",
            "LipDiffuserRun11_7340032": "lipdiffuser/run11/7340032",
            // NOTE(review): the table header for this column reads "run11/7340032_aligned",
            // while the element ids and the file path use "0007340_aligned" - confirm which is intended.
            "LipDiffuserRun11_0007340_aligned": "lipdiffuser/run11/0007340_aligned",
            "LipDiffuserRun12_2097152": "lipdiffuser/run12/2097152"
        };

        var select = document.getElementById('videoSelect');
        var selectedClip = select ? select.value : "";
        if (!selectedClip) {
            return;
        }

        Object.keys(systems).forEach(function (suffix) {
            var player = document.getElementById('videoPlayer' + suffix);
            var source = document.getElementById('videoSource' + suffix);
            // Skip a pair that is missing from the page instead of throwing,
            // so the remaining players are still updated.
            if (!player || !source) {
                return;
            }
            // Changing the src of a <source> child has no effect on its own;
            // load() makes the player re-run resource selection and pick it up.
            source.src = baseUrl + systems[suffix] + "/" + selectedClip + ".mp4";
            player.load();
        });
    }
</script>
<br>

<h2 id="citation">
    Citation
</h2>

<div class="container">
    <div class="inner-container" dir="auto" data-snippet-clipboard-copy-content="{% raw %}@inproceedings{oliveira2025lipdiffuser,
    title={{LipDiffuser:} Lip-to-Speech Generation with Conditional Diffusion Models},
    author={de Oliveira, Danilo and Richter, Julius and Peer, Tal and Gerkmann, Timo},
    booktitle={Not determined yet},
    year={2025}
}{% endraw %}">
<pre><code>{% raw %}@inproceedings{oliveira2025lipdiffuser,
    title={{LipDiffuser:} Lip-to-Speech Generation with Conditional Diffusion Models},
    author={de Oliveira, Danilo and Richter, Julius and Peer, Tal and Gerkmann, Timo},
    booktitle={Not determined yet},
    year={2025}
}{% endraw %}</code></pre>
    </div>
    <!-- Icon-only button: aria-label supplies the accessible name, the SVG is decorative,
         and type="button" keeps it from acting as a submit button if it ever sits inside a form. -->
    <button id="copyButton" class="highlight" type="button" aria-label="Copy citation to clipboard">
        <svg xmlns="http://www.w3.org/2000/svg" height="16" viewBox="0 0 16 16" width="16" class="octicon octicon-clippy" aria-hidden="true">
            <path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path>
            <path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
        </svg>
    </button>
</div>

<script>
    // Copies the citation stored in the data-snippet-clipboard-copy-content
    // attribute to the clipboard when the copy button is clicked.
    document.getElementById('copyButton').addEventListener('click', function() {
        const snippetHolder = document.querySelector('div[data-snippet-clipboard-copy-content]');
        // navigator.clipboard is only exposed in secure contexts (HTTPS or localhost).
        // Without this guard the handler would throw a TypeError and never reach
        // the failure alert below.
        if (!snippetHolder || !navigator.clipboard || !navigator.clipboard.writeText) {
            alert('Failed to copy!');
            return;
        }
        const codeContent = snippetHolder.getAttribute('data-snippet-clipboard-copy-content');
        navigator.clipboard.writeText(codeContent).catch(function() {
            alert('Failed to copy!');
        });
    });
</script>

<br>

<h2>
    References
</h2>

<ol id="refList" style="list-style-type: none; padding: 0;">
    <!-- JavaScript will populate this list -->
</ol>

<script>
    // Builds the numbered reference list in #refList and turns every element with
    // class "reference" into in-text citation links. References are numbered in
    // order of first citation; only cited entries are listed, so the list stays
    // empty when the page contains no ".reference" elements.
    document.addEventListener("DOMContentLoaded", function () {
        // Bibliography entries, keyed by the citation key used in data-ref attributes.
        const references = {
            "convtasnet": "Y. Luo and N. Mesgarani, “Conv-TasNet: Surpassing ideal time–frequency magnitude masking for speech separation,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 27, no. 8, pp. 1256–1266, 2019.",
            "cdiffuse": "Y.-J. Lu, Z.-Q. Wang, S. Watanabe, A. Richard, C. Yu, and Y. Tsao, “Conditional diffusion probabilistic model for speech enhancement,” in IEEE International Conference on Acoustics, Speech and Signal Processing, 2022, pp. 7402–7406.",
            "demucs": "S. Rouard, F. Massa, and A. Défossez, “Hybrid transformers for music source separation,” in IEEE International Conference on Acoustics, Speech and Signal Processing, 2023.",
            "sgmse": "J. Richter, S. Welker, J.-M. Lemercier, B. Lay, and T. Gerkmann, “Speech enhancement and dereverberation with diffusion-based generative models,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, pp. 2351–2364, 2023.",
            "diffv2s": "Jeongsoo Choi, Joanna Hong, and Yong Man Ro. Diffv2s: Diffusion-based video-to-speech synthesis with vision-guided speaker embedding. In Proceedings of the IEEE/CVF International Conference on Computer Vision, pages 7812–7821, 2023.",
            "lipvoicer": "Yochai Yemini, Aviv Shamsian, Lior Bracha, Sharon Gannot, and Ethan Fetaya. LipVoicer: Generating speech from silent videos guided by lip reading. In International Conference on Learning Representations, 2024.",
            "il2s": "Jeongsoo Choi, Minsu Kim, and Yong Man Ro. Intelligible lip-to-speech synthesis with speech units. In ISCA Interspeech, pages 4349–4353, 2023.",
        };

        // Splits an element's data-ref attribute into citation keys. Tolerates a
        // missing attribute and repeated whitespace (both yield no empty keys).
        function parseKeys(element) {
            const raw = element.getAttribute('data-ref') || '';
            return raw.split(/\s+/).filter(Boolean);
        }

        // Collect known keys in order of first appearance, without duplication.
        const referenceElements = document.querySelectorAll('.reference');
        const orderedKeys = [];
        referenceElements.forEach(element => {
            parseKeys(element).forEach(key => {
                if (!Object.prototype.hasOwnProperty.call(references, key)) {
                    console.warn(`Unknown reference key "${key}" ignored.`);
                    return;
                }
                if (!orderedKeys.includes(key)) {
                    orderedKeys.push(key);
                }
            });
        });

        // 1-based reference number for each cited key.
        const refNumberByKey = new Map(orderedKeys.map((key, index) => [key, index + 1]));

        // Points the back-link of a reference-list entry at the citation that was
        // just clicked, so following "[n]" in the list returns to that citation.
        function changeId(element) {
            const backLink = document.getElementById(`link-${element.dataset.refKey}`);
            if (backLink) {
                backLink.href = `#${element.id}`;
            }
        }

        const refList = document.getElementById('refList');
        const refTotal = orderedKeys.length;
        // Reserve 10px per digit of the largest reference number, plus 5px.
        const maxWidth = `${Math.ceil(Math.log10(refTotal + 1)) * 10 + 5}px`;

        orderedKeys.forEach(key => {
            const refNumber = refNumberByKey.get(key);
            const li = document.createElement('li');
            li.id = `ref-${refNumber}`;
            li.style.display = 'flex';
            li.style.alignItems = 'baseline';
            li.style.marginBottom = '5px';

            const numberSpan = document.createElement('span');
            // 'normal' is the valid keyword; the previous 'regular' was rejected by the browser.
            numberSpan.style.fontWeight = 'normal';
            numberSpan.style.minWidth = maxWidth;
            numberSpan.style.textAlign = 'right';
            numberSpan.style.marginRight = '10px';
            numberSpan.style.whiteSpace = 'nowrap';

            const numberLink = document.createElement('a');
            // Initially jump back to the first citation of this reference.
            numberLink.href = `#cite-${key}-0`;
            numberLink.id = `link-${key}`;
            numberLink.textContent = `[${refNumber}]`;
            numberLink.style.textDecoration = 'none';
            numberSpan.appendChild(numberLink);

            const textSpan = document.createElement('span');
            textSpan.textContent = references[key];

            li.appendChild(numberSpan);
            li.appendChild(textSpan);
            refList.appendChild(li);
        });

        // Number of citations emitted so far per key; used to build unique ids
        // of the form cite-<key>-<n>, with n starting at 0.
        const citeCounts = new Map();

        // Make in-text references linkable and combine multiple references.
        referenceElements.forEach(span => {
            // Unknown keys are dropped here; previously they rendered as "[0]"
            // and linked to a non-existent #ref-0.
            const keys = parseKeys(span).filter(key => refNumberByKey.has(key));
            keys.forEach((key, i) => {
                const occurrence = citeCounts.has(key) ? citeCounts.get(key) + 1 : 0;
                citeCounts.set(key, occurrence);

                // "[1]" for a single key, "[1, " … "2, " … "3]" for several.
                const opening = i === 0 ? '[' : '';
                const closing = i === keys.length - 1 ? ']' : ', ';
                const refNumber = refNumberByKey.get(key);

                const citeLink = document.createElement('a');
                citeLink.href = `#ref-${refNumber}`;
                citeLink.textContent = `${opening}${refNumber}${closing}`;
                citeLink.id = `cite-${key}-${occurrence}`;
                // Stored explicitly so changeId does not have to parse the key out of the id.
                citeLink.dataset.refKey = key;
                citeLink.style.textDecoration = 'none';
                citeLink.addEventListener('click', function () {
                    changeId(this);
                });

                span.appendChild(citeLink);
            });
        });

    });
</script>