index.html

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners, these should be filled in appropriatly as they are your "business card" -->
  <!-- Replace the content tag with appropriate information -->
  <meta name="description" content="DESCRIPTION META TAG">
  <meta property="og:title" content="SOCIAL MEDIA TITLE TAG"/>
  <meta property="og:description" content="SOCIAL MEDIA DESCRIPTION TAG TAG"/>
  <meta property="og:url" content="URL OF THE WEBSITE"/>
  <!-- Path to banner image, should be in the path listed below. Optimal dimenssions are 1200X630-->
  <meta property="og:image" content="static/image/your_banner_image.png" />
  <meta property="og:image:width" content="1200"/>
  <meta property="og:image:height" content="630"/>


  <meta name="twitter:title" content="TWITTER BANNER TITLE META TAG">
  <meta name="twitter:description" content="TWITTER BANNER DESCRIPTION META TAG">
  <!-- Path to banner image, should be in the path listed below. Optimal dimenssions are 1200X600-->
  <meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for your paper to be indexed by-->
  <meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>Spacewalk-18</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-2 publication-title">Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video Understanding</h1>
            <div class="is-size-5 publication-authors">
              <!-- Paper authors -->
              <span class="author-block">
                <a href="https://scholar.google.com/citations?user=koxiPYIAAAAJ&hl=en&oi=sra" target="_blank">Rohan Myer Krishnan</a><sup>*</sup>,</span>
                <span class="author-block">
                  <a href="https://zitiantang.github.io/" target="_blank">Zitian Tang</a><sup>*</sup>,</span>
                  <span class="author-block">
                    <a href="https://brown-palm.github.io/Spacewalk-18" target="_blank">Zhiqiu Yu</a>
                  </span>,
                  <span class="author-block">
                    <a href="https://chensun.me/" target="_blank">Chen Sun</a>
                  </span>
                  </div>

                  <div class="is-size-5 publication-authors">
                    <span class="author-block">Brown University<!--<br>Conferance name and year--></span>
                    <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
                  </div>

                  <!-- <div class="column has-text-centered">
                    <div class="publication-links"> -->
                         <!-- Arxiv PDF link -->
                      <!-- <span class="link-block">
                        <a href="https://arxiv.org/pdf/<ARXIV PAPER ID>.pdf" target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
                        <span class="icon">
                          <i class="fas fa-file-pdf"></i>
                        </span>
                        <span>Paper</span>
                      </a>
                    </span> -->

                    <!-- Supplementary PDF link -->
                    <!-- <span class="link-block">
                      <a href="static/pdfs/supplementary_material.pdf" target="_blank"
                      class="external-link button is-normal is-rounded is-dark">
                      <span class="icon">
                        <i class="fas fa-file-pdf"></i>
                      </span>
                      <span>Supplementary</span>
                    </a>
                  </span> -->

                <!-- ArXiv abstract Link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2311.18773" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>

                  <!-- Github link -->
                  <span class="link-block">
                    <a href="https://github.com/brown-palm/Spacewalk-18-benchmark" target="_blank"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Dataset</span>
                  </a>
                </span>


            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser video-->
<!-- <section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <video poster="" id="tree" autoplay controls muted loop height="100%">

        <source src="static/videos/banner_video.mp4"
        type="video/mp4">
      </video>
      <h2 class="subtitle has-text-centered">
        Aliquam vitae elit ullamcorper tellus egestas pellentesque. Ut lacus tellus, maximus vel lectus at, placerat pretium mi. Maecenas dignissim tincidunt vestibulum. Sed consequat hendrerit nisl ut maximus. 
      </h2>
    </div>
  </div>
</section> -->
<!-- End teaser video -->

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Learning from videos is an emerging research area that enables robots to acquire skills from human demonstrations, such as procedural videos. To do this, video-language models must be able to obtain structured understandings, such as the temporal segmentation of a demonstration into sequences of actions and skills, and to generalize the understandings to novel domains. In pursuit of this goal, we introduce Spacewalk-18, a benchmark containing two tasks: (1) step recognition and (2) intra-video retrieval over a dataset of temporally segmented and labeled tasks in International Space Station spacewalk recordings. In tandem, the two tasks quantify a model's ability to make use of: (1) out-of-domain visual information; (2) a high temporal context window; and (3) multimodal (e.g. visual and speech) domains. This departs from existing benchmarks for procedural video understanding, which typically deal with short context lengths and can be solved with a single modality. Spacewalk-18, with its inherent multimodal and long-form complexity, exposes the high difficulty of task recognition and segmentation. We find that state-of-the-art methods perform poorly on our benchmark, but improvements can be obtained by incorporating information from longer-range temporal context across different modalities. Our experiments underscore the need to develop new approaches to these tasks.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Youtube video -->
<!-- <section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">

      <h2 class="title is-3">Video Presentation</h2>
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          
          <div class="publication-video">

            <iframe src="https://www.youtube.com/embed/JkaxUblCGz0" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
          </div>
        </div>
      </div>
    </div>
  </div>
</section> -->
<!-- End youtube video -->


<!-- Video carousel -->
<!-- <section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3">Another Carousel</h2>
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-video1">
          <video poster="" id="video1" autoplay controls muted loop height="100%">

            <source src="static/videos/carousel1.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video2">
          <video poster="" id="video2" autoplay controls muted loop height="100%">

            <source src="static/videos/carousel2.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\

            <source src="static/videos/carousel3.mp4"
            type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section> -->
<!-- End video carousel -->

<!-- Intro-->
<section class="hero is-small">
  <div class="hero-body" style="font-size: large;">
    <div class="container">
      <h2 class="title is-3">Spacewalk-18 Dataset</h2>
      <div class="container">
        <div class="hero-body">
          <img src="static/videos/teaser.gif" alt="teaser"/>
        </div>
      </div>
     </div>
      </div>
    </div>
  </section>
<!-- End Intro-->

<!-- Statistics-->
<section class="hero is-small">
  <div class="hero-body" style="font-size: large;">
    <div class="container">
      <h2 class="title is-3">Statistics</h2>
        <p>Spacewalk-18 annotates 18 spacewalk videos from 2019 to 2023 with a total length of 96 hours. On average, each spacewalk task consists of 25 steps. Each step has an average of 12 minutes of animation video. </p>
        <div id="results-carousel" class="carousel results-carousel">
          <div class="item">
           <!-- Your image here -->
            <img src="static/images/merged_durations.png" alt="durations"/>
           <h2 class="subtitle has-text-centered">
            Distribution of shot durations
           </h2>
         </div>
         <div class="item">
           <!-- Your image here -->
           <img src="static/images/wordcloud.png" alt="wordcloud"/>
           <h2 class="subtitle has-text-centered">
            Word cloud of the transcriptions
           </h2>
         </div>
     </div>
      </div>
    </div>
  </section>
<!-- End Statistics-->

<!-- Tasks-->
<section class="hero is-small">
  <div class="hero-body" style="font-size: large;">
    <div class="container">
      <h2 class="title is-3">Tasks</h2>
      <p>Two multimodal long-form video understanding tasks are defined on Spacewalk-18 - <b> step recognition </b> and <b> intra-video retrieval</b>.</p>
      <br>
      <p><b> Step recognition.</b> Given a timestamp and a context window length, step recognition aims to recognize the task step that the timestamp belongs to.</p>
      <br>
      <p><b> Intra-video retrieval.</b> Given a query timestamp, two candidate timestamps with the same time distances to the query, and a context window length, intra-video retrieval aims to determine the candidate that belongs to the same task step as the query.</p>
      <div class="container">
        <div class="hero-body">
          <img src="static/images/task_img.png" alt="tasks"/>
        </div>
      </div>
      </div>
    </div>
  </div>
</section>
<!--End Tasks -->

<!-- Evaluations-->
<section class="hero is-small"> 
  <div class="hero-body" style="font-size: large;">
    <div class="container">
      <h2 class="title is-3">Evaluation Results</h2>
        <p>We evaluate a few pretrained models in both zero-shot and fine-tuning scenarios. All the models perform poorly on our tasks and significantly worse than humans.</p>
        <div id="results-carousel" class="carousel results-carousel">
          <div class="item">
           <!-- Your image here -->
            <img src="static/images/table_steprecog.png" alt="steprecog"/>
           <h2 class="subtitle has-text-centered">
            Evaluation on step recognition task
           </h2>
         </div>
         <div class="item">
           <!-- Your image here -->
           <img src="static/images/table_retrieval.png" alt="retrieval"/>
           <h2 class="subtitle has-text-centered">
            Evaluation on intra-video retrieval task
           </h2>
         </div>
     </div>
      </div>
    </div>
  </section>
<!-- End Evaluations-->


<!-- Paper poster -->
<!-- <section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title">Poster</h2>

      <iframe  src="static/pdfs/sample.pdf" width="100%" height="550">
          </iframe>
        
      </div>
    </div>
  </section> -->
<!--End paper poster -->


<!--BibTex citation -->
  <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <pre><code>@misc{krishnan2023spacewalk18,
        title={Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video Understanding in Novel Domains}, 
        author={Rohan Myer Krishnan and Zitian Tang and Zhiqiu Yu and Chen Sun},
        year={2023},
        eprint={2311.18773},
        archivePrefix={arXiv},
        primaryClass={cs.CV}
  }</code></pre>
    </div>
</section>
<!--End BibTex citation -->


  <footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">

          <p>
            This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
            <br> This website is licensed under a <a rel="license"  href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>

        </div>
      </div>
    </div>
  </div>
</footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>