Skip to content

Commit

Permalink
[sc-8452] Add RSS connector (#33)
Browse files Browse the repository at this point in the history
* [sc-8452] Add RSS connector

* Fix format
  • Loading branch information
operramon authored Mar 25, 2024
1 parent 2fadf21 commit 3dc4086
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 1 deletion.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "nuclia-sync-agent-app",
"version": "1.2.4",
"version": "1.2.5",
"description": "This is a Nuclia Sync Agent App",
"main": "build/index.js",
"scripts": {
Expand Down
4 changes: 4 additions & 0 deletions server/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 1.2.5 (2024-03-22)

- Add RSS connector

# 1.2.4 (2024-03-18)

- Support CSS selectors in sitemap component
Expand Down
122 changes: 122 additions & 0 deletions server/src/logic/connector/infrastructure/connectors/rss.connector.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import { from, map, Observable, of } from 'rxjs';
import { ConnectorParameters, FileStatus, IConnector, Link, SearchResults, SyncItem } from '../../domain/connector';
import { SourceConnectorDefinition } from '../factory';
import * as cheerio from 'cheerio';
import { TO_BE_CHECKED } from '../../../sync/domain/sync.entity';

/** One feed entry as produced by `parseRSS`. */
interface RSSItem {
/** Text content of the item's `<link>` element. */
link: string;
/** Publication date as an ISO 8601 string (the output of `Date.prototype.toISOString`). */
pubDate: string;
}

/**
 * Downloads the raw text of an RSS feed.
 *
 * Best-effort helper: a network error, a non-2xx HTTP status, or a failure
 * while reading the response body is logged and reported as an empty string
 * instead of a rejected promise.
 *
 * @param url - Address of the RSS feed.
 * @returns The response body, or `''` when the feed could not be retrieved.
 */
async function fetchRSS(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // An HTTP error page is not a feed; send it through the same logging path as network failures.
      throw new Error(`Unexpected HTTP status ${response.status}`);
    }
    // Await here so an error while reading the body is caught below
    // rather than escaping the try block as a rejection.
    return await response.text();
  } catch (error) {
    console.error('Error fetching RSS feed', error);
    return '';
  }
}

/**
 * Extracts the link and publication date of every `<item>` in an RSS document.
 *
 * Items with an empty link, or whose `<pubDate>` cannot be parsed as a date,
 * are discarded. Content with no `<item>` elements yields an empty list.
 *
 * @param rss - Raw RSS XML.
 * @returns One entry per usable item, with `pubDate` normalised via `toISOString()`.
 */
export function parseRSS(rss: string): RSSItem[] {
  const $ = cheerio.load(rss, { xml: true });
  return $('item')
    .toArray()
    .map((item) => {
      const entry = $(item);
      // Element text may carry surrounding whitespace/newlines (e.g. pretty-printed feeds);
      // strip it so the link is a clean URL and the date string parses reliably.
      const link = entry.find('link').text().trim();
      const date = new Date(entry.find('pubDate').text().trim());
      // An unparseable date has a NaN timestamp; mark it with '' so the filter drops the item.
      const pubDate = Number.isNaN(date.getTime()) ? '' : date.toISOString();
      return { link, pubDate };
    })
    .filter((item) => item.link !== '' && item.pubDate !== '');
}

/** Connector definition exposed under the `rss` id; `factory` builds a new `RSSImpl` on each call. */
export const RSSConnector: SourceConnectorDefinition = {
  id: 'rss',
  factory() {
    return new RSSImpl();
  },
};

/**
 * Connector that exposes the entries of an RSS feed as link resources.
 *
 * The feed address is read from the `url` parameter. No credentials are
 * involved, so every authentication-related method reports success.
 */
class RSSImpl implements IConnector {
  isExternal = true;
  params: ConnectorParameters = {};

  /** Always `true`: there is no authentication data to provide. */
  hasAuthData(): boolean {
    return true;
  }

  /** Stores `params` as the connector's current parameters. */
  setParameters(params: ConnectorParameters) {
    this.params = params;
  }

  /** Parameters are accepted when they carry a truthy `url`. */
  areParametersValid(params: ConnectorParameters): boolean {
    return Boolean(params?.url);
  }

  /** Returns the parameters last passed to `setParameters` (initially an empty object). */
  getParameters(): ConnectorParameters {
    return this.params;
  }

  /** Not supported by this connector: always throws. */
  getFolders(): Observable<SearchResults> {
    throw new Error('Method not implemented.');
  }

  /**
   * Fetches the configured feed and emits one pending item per parsed entry.
   * The `query` argument is ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getFiles(query?: string): Observable<SearchResults> {
    const feedUrl = this.params['url'];
    const feed$ = from(fetchRSS(feedUrl));

    return feed$.pipe(
      map((xml) => {
        const entries = parseRSS(xml);
        return {
          items: entries.map(({ link, pubDate }) => ({
            title: link,
            status: FileStatus.PENDING,
            // NOTE(review): millisecond timestamp, so entries mapped within the same
            // millisecond share a uuid — confirm that consumers do not rely on uniqueness.
            uuid: `${new Date().getTime()}`,
            originalId: link,
            mimeType: TO_BE_CHECKED,
            modifiedGMT: pubDate,
            metadata: { uri: link },
          })),
        };
      }),
    );
  }

  /** Not supported by this connector: always throws. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  download(resource: SyncItem): Observable<Blob | undefined> {
    throw new Error('Method "download" not implemented.');
  }

  /** Builds the link for a resource from its `uri` metadata and the configured `cssSelector`. */
  getLink(resource: SyncItem): Observable<Link> {
    const { cssSelector } = this.getParameters();
    return of({
      uri: resource.metadata['uri'],
      extra_headers: {},
      cssSelector,
    });
  }

  /** The `folders` argument is ignored; this is equivalent to `getFiles()`. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getFilesFromFolders(folders: SyncItem[]): Observable<SearchResults> {
    return this.getFiles();
  }

  /**
   * Emits the feed items whose `modifiedGMT` is set and compares greater than `since`.
   * The `folders` argument is ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getLastModified(since: string, folders?: SyncItem[]): Observable<SearchResults> {
    return this.getFiles().pipe(
      map((results) => {
        const recent = results.items.filter((item) => item.modifiedGMT && item.modifiedGMT > since);
        return { ...results, items: recent };
      }),
    );
  }

  /** Nothing to refresh: emits `true`. */
  refreshAuthentication(): Observable<boolean> {
    return of(true);
  }

  /** No token is used: emits `true`. */
  isAccessTokenValid(): Observable<boolean> {
    return of(true);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
 * RSS 2.0 sample document containing two `<item>` entries, each with a
 * `<link>` and a `<pubDate>`. Note that the literal begins with a newline,
 * so the XML declaration is not the very first character of the string.
 */
export const RSS_SAMPLE = `
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<title>Nuclia Blog</title>
<description>Nuclia blog</description>
<link>https://nuclia.com/blog</link>
<lastBuildDate>Fri, 22 Mar 2024 03:57:36 GMT</lastBuildDate>
<atom:link href="https://nuclia.com/blog/rss" rel="self" type="application/rss+xml"/>
<item>
<title><![CDATA[How to build generative AI search for your data]]></title>
<description><![CDATA[How to build generative AI search for your data]]></description>
<link>https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/</link>
<guid isPermaLink="true">https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/</guid>
<pubDate>Wed, 14 Mar 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Streamline LLM quality assurance for RAG</title>
<description>Streamline LLM quality assurance for RAG></description>
<link>https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/</link>
<guid isPermaLink="true">https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/</guid>
<pubDate>Wed, 11 Mar 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { describe, expect, test } from 'vitest';
import { getConnector } from '../../factory';
import { parseRSS } from '../rss.connector';
import { RSS_SAMPLE } from './rss-data';

// Extended `test` whose cases receive, as `sourceConnector`, a connector built by the `rss` definition's factory.
const rssTest = test.extend({
  // eslint-disable-next-line no-empty-pattern
  sourceConnector: async ({}, use) => {
    await use(getConnector('rss').factory());
  },
});

// Parameter validation of the RSS connector: only a non-empty `url` makes the parameters valid.
// (Suite title previously said "sitemap", although the connector under test is the RSS one.)
describe('Test validate RSS params', () => {
  rssTest('Incorrect - Without params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({})).toBe(false);
  });

  rssTest('Incorrect - With wrong params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ incorrect: 'test' })).toBe(false);
  });

  rssTest('Incorrect - With empty params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ url: '' })).toBe(false);
  });

  rssTest('Correct - With correct params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ url: 'http://somewhere/rss.xml' })).toBe(true);
  });
});

// Parser behaviour: content without items yields an empty list; the sample feed yields
// both items with their dates in ISO format.
describe('Test RSS parser', () => {
  test('Should return empty list if content is invalid', () => {
    const parsed = parseRSS('Invalid XML');
    expect(parsed).toEqual([]);
  });

  test('Should parse the RSS format', () => {
    const expectedItems = [
      {
        link: 'https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/',
        pubDate: '2024-03-14T12:00:00.000Z',
      },
      {
        link: 'https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/',
        pubDate: '2024-03-11T09:00:00.000Z',
      },
    ];
    const parsed = parseRSS(RSS_SAMPLE);
    expect(parsed).toEqual(expectedItems);
  });
});
2 changes: 2 additions & 0 deletions server/src/logic/connector/infrastructure/factory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { GDriveConnector } from './connectors/gdrive.connector';
import { OneDriveConnector } from './connectors/onedrive.connector';
import { SharepointConnector } from './connectors/sharepoint.connector';
import { SitemapConnector } from './connectors/sitemap.connector';
import { RSSConnector } from './connectors/rss.connector';

export interface ConnectorDefinition {
id: string;
Expand All @@ -27,6 +28,7 @@ const connectors: { [id: string]: SourceConnectorDefinition } = {
sharepoint: SharepointConnector,
sitemap: SitemapConnector,
confluence: ConfluenceConnector,
rss: RSSConnector,
};

// TODO: add the dynamic connectors
Expand Down

0 comments on commit 3dc4086

Please sign in to comment.