Skip to content

Commit

Permalink
release TxT360
Browse files Browse the repository at this point in the history
  • Loading branch information
caris-mu committed Oct 7, 2024
1 parent a79b0ce commit 9f13977
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 34 deletions.
2 changes: 1 addition & 1 deletion about.html
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
</a>
<h2>LLM360</h2>
<ul>
<!-- <li><a href="index.html#datasets">Datasets</a></li>-->
<li><a href="index.html#datasets">Datasets</a></li>
<li><a href="index.html#models">Models</a></li>
<li><a href="index.html#paper">Papers</a></li>
<li><a href="index.html#blogs">Blogs</a></li>
Expand Down
58 changes: 49 additions & 9 deletions assets/css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -4429,17 +4429,57 @@ body.is-preload #sidebar > .inner {
}
}

/*.alert-banner {*/
/* background-image: linear-gradient(to right, rgba(136, 215, 17, 0.95), rgba(4, 103, 229, 0.95));*/
/* color: white; !* White text *!*/
/* padding: 10px; !* Some padding *!*/
/* text-align: center; !* Centered text *!*/
/* position: relative; !* Relative positioning *!*/
/* width: 100%; !* Full width *!*/
/* top: 0; !* Stick to top *!*/
/* left: 0; !* Align to the left *!*/
/* box-shadow: 0 4px 8px rgba(0,0,0,0.1); !* Add a subtle shadow *!*/
/* z-index: 1000; !* Ensure it's on top *!*/
/*}*/
/* News banner strip: gradient background with centred white text. */
.alert-banner {
  background-image: linear-gradient(to right, rgba(136, 215, 17, 0.95), rgba(4, 103, 229, 0.95));
  color: white;
  padding: 10px;
  text-align: center;
  /* Relative positioning makes the banner the containing block for the
     absolutely positioned .arrow-btn children. */
  position: relative;
  width: 100%;
  /* Zero offsets: with position: relative these leave the banner in its
     normal-flow position. */
  top: 0;
  left: 0;
  box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* subtle drop shadow */
  z-index: 1000; /* paint above following content */
}

/* Links inside the banner: no underline, white to match the banner text. */
.alert-banner a {
  text-decoration: none;
  color: white;
}

/* Banner navigation arrows: borderless glyphs, vertically centred on the
   edge of their positioned ancestor. */
.arrow-btn {
  position: absolute;
  top: 50%;
  transform: translateY(-50%);
  border: none;
  background-color: transparent;
  color: white;
  font-size: 18px;
  cursor: pointer;
}

/* Previous arrow sits on the left edge. */
#prevBtn {
  left: 10px;
}

/* Next arrow sits on the right edge. */
#nextBtn {
  right: 10px;
}

/* Slightly dim the arrow on hover. */
.arrow-btn:hover {
  color: rgba(255, 255, 255, 0.8);
}

.tags {
Expand Down
32 changes: 32 additions & 0 deletions assets/js/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,38 @@
});
}
});

// Rotating news banner: the arrow buttons cycle the banner link through
// `messages`, wrapping around at either end.
// NOTE(review): the banner markup may be absent on some pages that load this
// script, so every element lookup is null-checked; an unguarded
// `getElementById(...).addEventListener` would throw and abort the rest of
// this script.
const bannerContent = document.getElementById("bannerContent");
const messages = [
    {
        text: 'Announcing our first dataset <strong>TxT360</strong>: Learn More Here.',
        link: 'https://huggingface.co/spaces/LLM360/TxT360'
    },
    {
        text: 'The Institute of Foundation Model is <strong>hiring</strong> !',
        link: 'https://mbzuai.ac.ae/institute-of-foundation-models/'
    }
];

// Index into `messages` of the announcement currently shown.
let currentIndex = 0;

/**
 * Render messages[currentIndex] into the banner link (markup and target URL).
 * Does nothing when the banner link element is not present on the page.
 */
function updateBanner() {
    if (!bannerContent) {
        return;
    }
    // innerHTML is used because the message text carries <strong> markup;
    // the strings are hard-coded above, not user input.
    bannerContent.innerHTML = messages[currentIndex].text;
    bannerContent.href = messages[currentIndex].link;
}

const prevButton = document.getElementById("prevBtn");
const nextButton = document.getElementById("nextBtn");

// Previous button action: step back, wrapping from the first message to the last.
if (prevButton) {
    prevButton.addEventListener("click", () => {
        currentIndex = (currentIndex === 0) ? messages.length - 1 : currentIndex - 1;
        updateBanner();
    });
}

// Next button action: step forward, wrapping from the last message to the first.
if (nextButton) {
    nextButton.addEventListener("click", () => {
        currentIndex = (currentIndex === messages.length - 1) ? 0 : currentIndex + 1;
        updateBanner();
    });
}

// Scroll to see all pictures in the gallery
document.addEventListener('DOMContentLoaded', function () {
Expand Down
2 changes: 1 addition & 1 deletion evaluation.html
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
</a>
<h2>LLM360</h2>
<ul>
<!-- <li><a href="index.html#datasets">Datasets</a></li>-->
<li><a href="index.html#datasets">Datasets</a></li>
<li><a href="index.html#models">Models</a></li>
<li><a href="index.html#paper">Papers</a></li>
<li><a href="index.html#blogs">Blogs</a></li>
Expand Down
50 changes: 27 additions & 23 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
</a>
<h2>LLM360</h2>
<ul>
<!-- <li><a href="#datasets">Datasets</a></li>-->
<li><a href="#datasets">Datasets</a></li>
<li><a href="#models">Models</a></li>
<li><a href="#paper">Papers</a></li>
<li><a href="#blogs">Blogs</a></li>
Expand All @@ -123,7 +123,13 @@ <h2>LLM360</h2>
<!-- <div class="alert-banner" id="alertBanner">-->
<!-- <a href="https://huggingface.co/spaces/LLM360/TxT360" target="_blank">Announcing our first dataset <strong>TxT360</strong>: Learn More Here.</a>-->
<!-- </div>-->

<!-- News banner: #bannerContent is the announcement link; #prevBtn / #nextBtn are the arrow controls. -->
<div class="alert-banner" id="alertBanner">
  <!-- rel="noopener noreferrer": the link opens a new tab, so deny the opened page access to window.opener -->
  <a href="https://huggingface.co/spaces/LLM360/TxT360" target="_blank" rel="noopener noreferrer" id="bannerContent">
    Announcing our first dataset <strong>TxT360</strong>: Learn More Here.
  </a>
  <a id="prevBtn" class="arrow-btn">&#9664;</a> <!-- Left arrow -->
  <a id="nextBtn" class="arrow-btn">&#9654;</a> <!-- Right arrow -->
</div>
<!-- Intro -->
<section id="intro" class="wrapper style1 fullscreen fade-up">
<div class="inner">
Expand All @@ -138,27 +144,25 @@ <h1><strong>LLM360</strong> enables <strong>community-owned AI</strong> through
</section>

<!-- Datasets -->
<!-- <section id="datasets" class="wrapper style2 spotlights">-->
<!-- <div class="inner">-->
<!-- <h1>Datasets</h1>-->
<!-- </div>-->
<!-- <section id="txt360">-->
<!-- <span class="image">-->
<!-- <img src="images/txt360_logo.png" alt=""/>-->
<!-- </span>-->
<!-- <div class="content">-->
<!-- <h3><strong>TxT360</strong>: the most comprehensive, highest quality, and production ready pretraining dataset</h3>-->
<!-- <p>-->
<!-- Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.-->
<!-- <br>-->
<!-- We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects.-->
<!-- </p>-->
<!-- <ul class="actions">-->
<!-- <li><a href="https://huggingface.co/spaces/LLM360/TxT360" target="_blank" class="button">Learn more</a></li>-->
<!-- </ul>-->
<!-- </div>-->
<!-- </section>-->
<!-- </section>-->
<section id="datasets" class="wrapper style2 spotlights">
  <div class="inner">
    <h1>Datasets</h1>
  </div>
  <!-- TxT360 spotlight: logo, summary and link to the dataset page -->
  <section id="txt360">
    <span class="image">
      <img src="images/txt360_logo.png" alt=""/>
    </span>
    <div class="content">
      <h2>TxT360</h2>
      <p>
        We introduce <strong>TxT360 (Trillion eXtracted Text)</strong>, the <strong>first</strong> dataset to <strong>globally deduplicate</strong> 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enable precise control over data distribution.
      </p>
      <ul class="actions">
        <!-- rel="noopener noreferrer": the link opens a new tab, so deny the opened page access to window.opener -->
        <li><a href="https://huggingface.co/spaces/LLM360/TxT360" target="_blank" rel="noopener noreferrer" class="button">Learn more</a></li>
      </ul>
    </div>
  </section>
</section>

<!-- Models -->
<section id="models" class="wrapper style2 spotlights">
Expand Down

0 comments on commit 9f13977

Please sign in to comment.