Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Default mail if nothing new since last newsletter #12

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ endif
help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

init: ## Initialize the project
make install
make build
init: install build ## Initialize the project

install: ## Install the dependencies
npm install
Expand All @@ -23,9 +21,9 @@ webpage: ## Run the webpage localy
npm run format
cd website && npm run start

send_mail: ## Send newsletter mail
sendMail: ## Send newsletter mail
cp -n .env.sample .env
npx ts-node curator/src/mail_agent/newsletter_script.ts
npx ts-node curator/src/mailAgent/newsletterScript.ts

build: ## Compile the project
npm run build
Expand All @@ -39,13 +37,13 @@ dev: ## Run the CLI in development mode
npm run format
npm --workspace website run dev

conv_agent: ## Test the conversational agent with mail
convAgent: ## Test the conversational agent with mail
npm run format
npm --workspace conversational_agent run start
npm --workspace conversationalAgent run start

conv_agent_test: ## Test the conversational agent
convAgentTest: ## Test the conversational agent
npm run format
npm --workspace conversational_agent run test
npm --workspace conversationalAgent run test

clean: ## To clean the project
rm -rf node_modules
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
9 changes: 7 additions & 2 deletions curator/src/curate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export interface CurateOptions {
links: string[];
interests?: string[];
max?: number;
minimumDate?: Date;
maxContentSize?: number;
onProgress?: (progress: number) => void;
}
Expand All @@ -14,13 +15,15 @@ export const curate = async ({
links,
interests = [],
max = 5,
minimumDate,
maxContentSize,
onProgress,
}: CurateOptions) => {
const summaries = await scrapeAndSummarizeArticles({
links,
interests,
maxContentSize,
minimumDate,
onProgress,
});
const mostRelevantArticles = getMostRelevant({ summaries, max });
Expand All @@ -31,20 +34,22 @@ interface ScrapeAndSummarizeArticlesOptions {
links: string[];
interests?: string[];
maxContentSize?: number;
minimumDate?: Date;
onProgress?: (progress: number) => void;
}

const scrapeAndSummarizeArticles = async ({
links,
interests = [],
minimumDate = new Date(0),
maxContentSize,
onProgress = () => {},
}: ScrapeAndSummarizeArticlesOptions) => {
const content: Summary[] = [];
for (let i = 0; i < links.length; i++) {
const text = await scrape({ url: links[i], maxContentSize });
const {text, date} = await scrape({ url: links[i], maxContentSize });
onProgress(i + 0.5);
if (text === undefined) {
if (text === undefined || date < minimumDate) {
onProgress(i + 1);
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ function formatNewsletterMarkdown(articles: Summary[]) {
> ${summary}\n\n`;
};

// If there are no new articles
if (articles.length === 0) {
return `
Newsletter\n\n
Hello hello ! Sorry, no new articles today. But don't worry, we'll be back soon with more news!\n
See you soon!\n
`;
}

return `
Newsletter\n\n
Hello everyone! Here are the latest news!\n\n
Expand Down Expand Up @@ -57,6 +66,16 @@ function formatNewsletterHtmlWithCSS(articles: Summary[]) {
</div>`;
};

// If there are no new articles
if (articles.length === 0) {
return `
<div style="font-family: Arial, sans-serif; background-color: #f9f9f9; color: #333; padding: 20px; border-radius: 10px; max-width: 800px; margin: 0 auto;">
<h1 style="color: #4CAF50; text-align: center; font-size: 32px;">Newsletter</h1>
<p style="font-size: 18px; text-align: center;">Hello hello ! Sorry, no new articles today. But don't worry, we'll be back soon with more news!</p>
<p style="font-size: 18px; text-align: center;">See you soon!</p>
</div>`;
}

// Main newsletter structure
const htmlNewsletter = `
<div style="font-family: Arial, sans-serif; background-color: #f9f9f9; color: #333; padding: 20px; border-radius: 10px; max-width: 800px; margin: 0 auto;">
Expand All @@ -70,23 +89,25 @@ function formatNewsletterHtmlWithCSS(articles: Summary[]) {

// TODO : add parameters based on user (links,interests?,maxArticles?,maxContentSize?)

export async function curateAndGenerateNewsletter(): Promise<{
export async function curateAndGenerateNewsletter()
: Promise<{
markdown: string;
html: string;
}> {
}>
{
return curate({
links,
interests: ['react', 'ai'],
interests: ['lego'],
max: 5,
minimumDate : new Date('2025-02-01'),
Aurelien-Gindre marked this conversation as resolved.
Show resolved Hide resolved
})
.then((curatedLinks: Summary[]) => {
// Generate the formatted string when promise completed

const markdown = formatNewsletterMarkdown(curatedLinks);
const html = formatNewsletterHtmlWithCSS(curatedLinks);

// Returns raw json and formatted newsletters

return { markdown, html };
})
.catch((err: unknown) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { ServerClient } from 'postmark';

dotenv.config({ path: './../.env' });

const sendMail = true;
const sendMail = false;

// Calls the curateAndGenerateNewsletter function with right parameters then sends the mail
// TODO : Add all the dynamic part (newsletter parameters, email params)
Expand Down
74 changes: 73 additions & 1 deletion curator/src/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { Readability } from '@mozilla/readability';
import * as cheerio from 'cheerio';
import { JSDOM } from 'jsdom';
import { parse, isValid } from 'date-fns';

export interface ScrapeOptions {
url: string;
Expand All @@ -15,5 +16,76 @@ export const scrape = async ({
const $ = cheerio.load(await response.text());
const dom = new JSDOM($.html());
const article = new Readability(dom.window.document).parse();
return article?.textContent.substring(0, maxContentSize);
const publicationDate = await scrapeDate($);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thought: I'm afraid this way of getting the article's publication date is a bit hit-or-miss. For example, targeting HTML elements such as .publication-date or #pub-date in the hope of finding the publication date is very specific and will only work on a few pages!
How to be sure that there is a publication date in the page and how to be sure to get the right one if there are several dates in the page?
IMO we can find a better way to handle this feature (using the database for instance).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the moment it is the only way I found to find the date. The original scraper might mistake a date in the text as the publication date.
It only fetches the fields likely to have the publication date, and if multiple are found, it takes the most recent one.


return {
text: article?.textContent.substring(0, maxContentSize),
date: publicationDate
};
};

/**
 * Best-effort extraction of an article's publication date from a loaded page.
 *
 * Sources are tried in order, and the first one yielding a valid date wins:
 *  1. `<time>` elements: the `datetime` attribute, falling back to the
 *     element text; when several are valid, the most recent one is used.
 *  2. `<meta name="date">` / `<meta property="article:published_time">`.
 *  3. The first element matching `.publication-date` or `#pub-date`.
 *
 * @param $ - Cheerio handle on the page to inspect.
 * @returns The detected date, or the Unix epoch (`new Date(0)`) when nothing
 *          parseable is found. An Invalid Date is never returned, so callers
 *          can safely compare the result with `<` / `>`.
 */
async function scrapeDate($: cheerio.CheerioAPI): Promise<Date> {
  const dateFormats = [
    "yyyy-MM-dd'T'HH:mm:ssXXX",
    "yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
    "yyyy-MM-dd",
    "MM/dd/yyyy",
    "dd-MM-yyyy",
    "MMMM d, yyyy",
    "MMM d, yyyy",
    "d MMMM yyyy",
    "d MMM yyyy",
  ];

  // Strict parsing against the known formats only; null when none matches.
  const parseWithFormats = (raw: string): Date | null => {
    // Element text usually carries surrounding whitespace/newlines, which
    // would make every format fail.
    const text = raw.trim();
    if (text === '') {
      return null;
    }
    for (const format of dateFormats) {
      const parsed = parse(text, format, new Date());
      if (isValid(parsed)) {
        return parsed;
      }
    }
    return null;
  };

  // Native Date parsing first (handles ISO 8601 strings), then known formats.
  const parseLoosely = (raw: string | undefined): Date | null => {
    const text = raw?.trim() ?? '';
    if (text === '') {
      return null;
    }
    const native = new Date(text);
    return isValid(native) ? native : parseWithFormats(text);
  };

  // 1. <time> elements: collect every valid candidate, keep the latest.
  const timeDates: Date[] = [];
  $('time').each((_index, element) => {
    const timeTag = $(element);
    const dateTimeAttr = timeTag.attr('datetime');
    const fromAttr = dateTimeAttr ? new Date(dateTimeAttr) : null;
    // A malformed datetime attribute must not abort the scrape: ignore it
    // and fall back to the element's text instead.
    const date =
      fromAttr && isValid(fromAttr)
        ? fromAttr
        : parseWithFormats(timeTag.text());
    if (date) {
      timeDates.push(date);
    }
  });
  if (timeDates.length > 0) {
    return timeDates.reduce((latest, current) =>
      current > latest ? current : latest,
    );
  }

  // 2. Meta tags with name="date" or property="article:published_time".
  const metaDate = parseLoosely(
    $('meta[name="date"], meta[property="article:published_time"]').attr(
      'content',
    ),
  );
  if (metaDate) {
    return metaDate;
  }

  // 3. Specific classes or IDs (example: .publication-date, #pub-date).
  // Only the first match is read: the combined text of several matches
  // would not be parseable.
  const specificDate = parseLoosely(
    $('.publication-date, #pub-date').first().text(),
  );
  if (specificDate) {
    return specificDate;
  }

  console.warn('No publication date found');
  return new Date(0); // Unix epoch start date (January 1, 1970)
}
11 changes: 11 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"@types/jsdom": "^21.1.7"
},
"dependencies": {
"date-fns": "^4.1.0",
"dompurify": "^3.2.4",
"jsdom": "^26.0.0"
}
Expand Down
2 changes: 1 addition & 1 deletion website/src/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"answer3": "Just reply to any email from the service to add topics, sources, or adjust your preferences.",
"question4": "How can I customize my newsletter?",
"answer4": "You can customize the frequency, topics, sources, number of articles, and summary length. Just reply to an email with your preferences. For example:\n\n'Hello Curator AI, I’d like to receive my newsletter every Monday and Thursday at 8 AM. Please include news about AI, healthcare, and finance. Add the websites https://www.lemonde.fr/ and https://www.techcrunch.com/ to my sources. Limit the newsletter to 5 articles per issue, with concise summaries.'"
},
},
"footer": {
"copyright": "Copyright &copy; {{date}} Marmelab",
"allRightsReserved": "All rights reserved."
Expand Down