-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [sc-8452] Add RSS connector * Fix format
- Loading branch information
Showing
6 changed files
with
202 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
# 1.2.5 (2024-03-22) | ||
|
||
- Add RSS connector | ||
|
||
# 1.2.4 (2024-03-18) | ||
|
||
- Support CSS selectors in sitemap component | ||
|
122 changes: 122 additions & 0 deletions
122
server/src/logic/connector/infrastructure/connectors/rss.connector.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import { from, map, Observable, of } from 'rxjs'; | ||
import { ConnectorParameters, FileStatus, IConnector, Link, SearchResults, SyncItem } from '../../domain/connector'; | ||
import { SourceConnectorDefinition } from '../factory'; | ||
import * as cheerio from 'cheerio'; | ||
import { TO_BE_CHECKED } from '../../../sync/domain/sync.entity'; | ||
|
||
/** One feed entry, reduced to the two fields the connector consumes. */
interface RSSItem {
  /** Text content of the entry's `<link>` element. */
  link: string;
  /** Publication date of the entry as an ISO 8601 string. */
  pubDate: string;
}
|
||
/**
 * Downloads the raw body of an RSS feed.
 *
 * Retrieval is best-effort: any failure is logged and reported as an empty
 * string, so callers never see a rejected promise.
 *
 * @param url - Address of the feed to download.
 * @returns The response body as text, or `''` when the request fails, the
 *   server answers with a non-2xx status, or the body cannot be read.
 */
async function fetchRSS(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // An error page is not a feed; do not hand it to the parser.
      console.error('Error fetching RSS feed', `HTTP ${response.status}`);
      return '';
    }
    // Await inside the try block so a failure while reading the body is
    // caught below instead of escaping as a rejected promise.
    return await response.text();
  } catch (error) {
    console.error('Error fetching RSS feed', error);
    return '';
  }
}
|
||
/**
 * Extracts the link and publication date of every `<item>` in an RSS document.
 *
 * Entries without a usable link or without a parseable publication date are
 * dropped, so unparseable input yields an empty list.
 *
 * @param rss - Raw feed content.
 * @returns One entry per valid item, with `pubDate` normalised to ISO 8601.
 */
export function parseRSS(rss: string): RSSItem[] {
  const $ = cheerio.load(rss, { xml: true });
  return $('item')
    .toArray()
    .map((item) => {
      const entry = $(item);
      // Take only the first match and trim it: `.text()` on several matches
      // concatenates them, and feeds often pad element content with whitespace.
      const link = entry.find('link').first().text().trim();
      const published = new Date(entry.find('pubDate').first().text().trim());
      // An invalid Date has a NaN timestamp; toISOString() would throw on it.
      const pubDate = Number.isNaN(published.getTime()) ? '' : published.toISOString();
      return { link, pubDate };
    })
    .filter((item) => item.link !== '' && item.pubDate !== '');
}
|
||
/** Connector definition exposing the RSS implementation under the `rss` id. */
export const RSSConnector: SourceConnectorDefinition = {
  id: 'rss',
  factory() {
    return new RSSImpl();
  },
};
|
||
/**
 * Connector that lists the entries of an RSS feed as items to synchronise.
 *
 * The feed address is read from the `url` parameter. The connector never
 * downloads content itself; it only hands out links to the feed entries.
 */
class RSSImpl implements IConnector {
  isExternal = true;
  params: ConnectorParameters = {};

  /** Always `true`: this connector does not use any credentials. */
  hasAuthData(): boolean {
    return true;
  }

  /** Stores the parameters used by later calls (notably `url`). */
  setParameters(params: ConnectorParameters) {
    this.params = params;
  }

  /** Parameters are valid as soon as a non-empty `url` is provided. */
  areParametersValid(params: ConnectorParameters): boolean {
    return !!params?.url;
  }

  /** Returns the parameters previously stored with `setParameters`. */
  getParameters(): ConnectorParameters {
    return this.params;
  }

  /**
   * Not supported by this connector.
   *
   * @throws Error always.
   */
  getFolders(): Observable<SearchResults> {
    throw new Error('Method not implemented.');
  }

  /**
   * Fetches the feed at `params.url` and maps every parsed entry to a pending
   * item. The entry link is used as title, original id and `metadata.uri`.
   *
   * @param query - Ignored.
   * @returns An observable emitting the list of feed entries.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getFiles(query?: string): Observable<SearchResults> {
    const url = this.params['url'];

    return from(fetchRSS(url)).pipe(
      map((content) => parseRSS(content)),
      map((items) => ({
        items: items.map((item) => ({
          title: item.link,
          status: FileStatus.PENDING,
          // NOTE(review): entries mapped within the same millisecond share
          // this value — confirm whether consumers need it to be unique.
          uuid: `${new Date().getTime()}`,
          originalId: item.link,
          mimeType: TO_BE_CHECKED,
          modifiedGMT: item.pubDate,
          metadata: {
            uri: item.link,
          },
        })),
      })),
    );
  }

  /**
   * Not supported by this connector.
   *
   * @throws Error always.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  download(resource: SyncItem): Observable<Blob | undefined> {
    throw new Error('Method "download" not implemented.');
  }

  /**
   * Builds the link for an item from its `metadata.uri`, forwarding the
   * `cssSelector` connector parameter.
   */
  getLink(resource: SyncItem): Observable<Link> {
    return of({
      uri: resource.metadata['uri'],
      extra_headers: {},
      cssSelector: this.getParameters().cssSelector,
    });
  }

  /** The `folders` argument is ignored; this is equivalent to `getFiles()`. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getFilesFromFolders(folders: SyncItem[]): Observable<SearchResults> {
    return this.getFiles();
  }

  /**
   * Lists the feed entries published strictly after `since`.
   *
   * Dates are compared as instants, so the result does not depend on both
   * sides using the same textual layout (milliseconds, `Z` versus an offset).
   * If either side cannot be parsed, the plain string comparison is used.
   *
   * @param since - Lower bound (exclusive) for the publication date.
   * @param folders - Ignored.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  getLastModified(since: string, folders?: SyncItem[]): Observable<SearchResults> {
    const sinceTime = Date.parse(since);

    return this.getFiles().pipe(
      map((searchResults) => ({
        ...searchResults,
        items: searchResults.items.filter((item) => {
          if (!item.modifiedGMT) {
            return false;
          }
          const modifiedTime = Date.parse(item.modifiedGMT);
          if (Number.isNaN(sinceTime) || Number.isNaN(modifiedTime)) {
            return item.modifiedGMT > since;
          }
          return modifiedTime > sinceTime;
        }),
      })),
    );
  }

  /** Nothing to refresh; always emits `true`. */
  refreshAuthentication(): Observable<boolean> {
    return of(true);
  }

  /** No access token is involved; always emits `true`. */
  isAccessTokenValid(): Observable<boolean> {
    return of(true);
  }
}
25 changes: 25 additions & 0 deletions
25
server/src/logic/connector/infrastructure/connectors/tests/rss-data.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Minimal RSS 2.0 document with two entries, used as a fixture by the parser tests.
// The XML declaration is the very first character of the document, and the
// weekday in each date matches the calendar date it accompanies.
export const RSS_SAMPLE = `<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
  <channel>
    <title>Nuclia Blog</title>
    <description>Nuclia blog</description>
    <link>https://nuclia.com/blog</link>
    <lastBuildDate>Fri, 22 Mar 2024 03:57:36 GMT</lastBuildDate>
    <atom:link href="https://nuclia.com/blog/rss" rel="self" type="application/rss+xml"/>
    <item>
      <title><![CDATA[How to build generative AI search for your data]]></title>
      <description><![CDATA[How to build generative AI search for your data]]></description>
      <link>https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/</link>
      <guid isPermaLink="true">https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/</guid>
      <pubDate>Thu, 14 Mar 2024 12:00:00 GMT</pubDate>
    </item>
    <item>
      <title>Streamline LLM quality assurance for RAG</title>
      <description>Streamline LLM quality assurance for RAG</description>
      <link>https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/</link>
      <guid isPermaLink="true">https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/</guid>
      <pubDate>Mon, 11 Mar 2024 09:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>`;
48 changes: 48 additions & 0 deletions
48
server/src/logic/connector/infrastructure/connectors/tests/rss.connector.spec.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import { describe, expect, test } from 'vitest'; | ||
import { getConnector } from '../../factory'; | ||
import { parseRSS } from '../rss.connector'; | ||
import { RSS_SAMPLE } from './rss-data'; | ||
|
||
// `test` extended with a `sourceConnector` fixture: a fresh instance created
// by the factory registered under the 'rss' id.
const rssTest = test.extend({
  // eslint-disable-next-line no-empty-pattern
  sourceConnector: async ({}, use) => {
    await use(getConnector('rss').factory());
  },
});
|
||
// Parameter validation of the RSS connector: only a non-empty `url` is accepted.
describe('Test validate RSS params', () => {
  rssTest('Incorrect - Without params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({})).toBe(false);
  });

  rssTest('Incorrect - With wrong params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ incorrect: 'test' })).toBe(false);
  });

  rssTest('Incorrect - With empty params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ url: '' })).toBe(false);
  });

  rssTest('Correct - With correct params', ({ sourceConnector }) => {
    expect(sourceConnector.areParametersValid({ url: 'http://somewhere/rss.xml' })).toBe(true);
  });
});
|
||
// Behaviour of parseRSS on unparseable input and on the sample feed.
describe('Test RSS parser', () => {
  test('Should return empty list if content is invalid', () => {
    const parsed = parseRSS('Invalid XML');
    expect(parsed).toEqual([]);
  });

  test('Should parse the RSS format', () => {
    // Entries are expected in document order, with dates normalised to ISO 8601.
    const expectedItems = [
      {
        link: 'https://nuclia.com/developers/how-to-build-generative-ai-search-for-your-data/',
        pubDate: '2024-03-14T12:00:00.000Z',
      },
      {
        link: 'https://nuclia.com/developers/streamline-llm-quality-assurance-for-rag/',
        pubDate: '2024-03-11T09:00:00.000Z',
      },
    ];
    const parsed = parseRSS(RSS_SAMPLE);
    expect(parsed).toEqual(expectedItems);
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters