Skip to content

Commit

Permalink
Browse: full support for markdown transform
Browse files Browse the repository at this point in the history
  • Loading branch information
enricoros committed May 13, 2024
1 parent 7946cd6 commit 7b07bb7
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 41 deletions.
73 changes: 53 additions & 20 deletions src/modules/browse/BrowseSettings.tsx
Original file line number Diff line number Diff line change
@@ -1,62 +1,95 @@
import * as React from 'react';
import { shallow } from 'zustand/shallow';
import { useShallow } from 'zustand/react/shallow';

import { Checkbox, FormControl, FormHelperText } from '@mui/joy';
import { Checkbox, FormControl, FormHelperText, Option, Select, Typography } from '@mui/joy';

import { FormInputKey } from '~/common/components/forms/FormInputKey';
import { Link } from '~/common/components/Link';
import { platformAwareKeystrokes } from '~/common/components/KeyStroke';

import { useBrowseCapability, useBrowseStore } from './store-module-browsing';
import { ExternalLink } from '~/common/components/ExternalLink';
import { FormLabelStart } from '~/common/components/forms/FormLabelStart';


export function BrowseSettings() {

// external state
const { mayWork, isServerConfig, isClientValid, inCommand, inComposer, inReact } = useBrowseCapability();
const { wssEndpoint, setWssEndpoint, setEnableCommandBrowse, setEnableComposerAttach, setEnableReactTool } = useBrowseStore(state => ({
const { mayWork, isServerConfig, isClientValid, inCommand, inComposer, inReact, inPersonas } = useBrowseCapability();
const {
wssEndpoint, setWssEndpoint,
pageTransform, setPageTransform,
setEnableCommandBrowse, setEnableComposerAttach, setEnableReactTool, setEnablePersonaTool,
} = useBrowseStore(useShallow(state => ({
wssEndpoint: state.wssEndpoint,
pageTransform: state.pageTransform,
setPageTransform: state.setPageTransform,
setWssEndpoint: state.setWssEndpoint,
setEnableCommandBrowse: state.setEnableCommandBrowse,
setEnableComposerAttach: state.setEnableComposerAttach,
setEnableReactTool: state.setEnableReactTool,
}), shallow);
setEnablePersonaTool: state.setEnablePersonaTool,
})));

const handlePageTransformChange = (_event: any, value: typeof pageTransform | null) => value && setPageTransform(value);


return <>

<FormHelperText sx={{ display: 'block' }}>
Configure a browsing service to enable loading links and pages. See the <Link
href='https://github.com/enricoros/big-agi/blob/main/docs/config-feature-browse.md' target='_blank' noLinkStyle>
browse configuration guide</Link> for more information.
</FormHelperText>
<Typography level='body-sm'>
Configure Browsing to enable loading links and web pages. <ExternalLink
href='https://github.com/enricoros/big-agi/blob/main/docs/config-feature-browse.md'>
Learn more</ExternalLink>.
</Typography>

<FormInputKey
autoCompleteId='browse-wss' label='Puppeteer Endpoint' noKey
autoCompleteId='browse-wss' label='Puppeteer Wss' noKey
value={wssEndpoint} onChange={setWssEndpoint}
rightLabel={!isServerConfig ? 'required' : '✔️ already set in server'}
required={!isServerConfig} isError={!isClientValid && !isServerConfig}
placeholder='wss://...'
/>


<FormControl orientation='horizontal' sx={{ justifyContent: 'space-between', alignItems: 'center' }}>
<FormLabelStart title='Load pages as:' />
<Select
variant='outlined'
value={pageTransform} onChange={handlePageTransformChange}
slotProps={{
root: { sx: { minWidth: '140px' } },
indicator: { sx: { opacity: 0.5 } },
button: { sx: { whiteSpace: 'inherit' } },
}}
>
<Option value='text'>Text (default)</Option>
<Option value='markdown'>Markdown</Option>
<Option value='html'>HTML</Option>
</Select>
</FormControl>


<Typography level='body-sm' sx={{ mt: 2 }}>Browsing enablement:</Typography>

<FormControl disabled={!mayWork}>
<Checkbox variant='outlined' label='Attach URLs' checked={inComposer} onChange={(event) => setEnableComposerAttach(event.target.checked)} />
<FormHelperText>{platformAwareKeystrokes('Load and attach a page when pasting a URL')}</FormHelperText>
<Checkbox size='sm' label='Paste URLs' checked={inComposer} onChange={(event) => setEnableComposerAttach(event.target.checked)} />
<FormHelperText>{platformAwareKeystrokes('Load and attach when pasting a URL')}</FormHelperText>
</FormControl>

<FormControl disabled={!mayWork}>
<Checkbox variant='outlined' label='/browse' checked={inCommand} onChange={(event) => setEnableCommandBrowse(event.target.checked)} />
<Checkbox size='sm' label='/browse' checked={inCommand} onChange={(event) => setEnableCommandBrowse(event.target.checked)} />
<FormHelperText>{platformAwareKeystrokes('Use /browse to load a web page')}</FormHelperText>
</FormControl>

<FormControl disabled={!mayWork}>
<Checkbox variant='outlined' label='ReAct' checked={inReact} onChange={(event) => setEnableReactTool(event.target.checked)} />
<Checkbox size='sm' label='ReAct' checked={inReact} onChange={(event) => setEnableReactTool(event.target.checked)} />
<FormHelperText>Enables loadURL() in ReAct</FormHelperText>
</FormControl>

{/*<FormControl disabled>*/}
{/* <Checkbox variant='outlined' label='Personas' checked={inPersonas} onChange={(event) => setEnablePersonaTool(event.target.checked)} />*/}
{/* <FormHelperText>Enable loading URLs by Personas</FormHelperText>*/}
{/*</FormControl>*/}
<FormControl disabled>
<Checkbox size='sm' label='Chat with Personas' checked={false} onChange={(event) => setEnablePersonaTool(event.target.checked)} />
<FormHelperText>Not yet available</FormHelperText>
{/*<FormHelperText>Enable loading URLs by Personas</FormHelperText>*/}
</FormControl>

</>;
}
8 changes: 4 additions & 4 deletions src/modules/browse/browse.client.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { useBrowseStore } from '~/modules/browse/store-module-browsing';
import { BrowsePageTransform, useBrowseStore } from '~/modules/browse/store-module-browsing';

import { apiAsyncNode } from '~/common/util/trpc.client';

Expand All @@ -7,7 +7,7 @@ import { apiAsyncNode } from '~/common/util/trpc.client';
const DEBUG_SHOW_SCREENSHOT = false;


export async function callBrowseFetchPage(url: string) {
export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePageTransform) {

// throw if no URL is provided
url = url?.trim() || '';
Expand All @@ -19,7 +19,7 @@ export async function callBrowseFetchPage(url: string) {
if (!url.startsWith('http://') && !url.startsWith('https://'))
url = 'https://' + url;

const clientWssEndpoint = useBrowseStore.getState().wssEndpoint;
const { wssEndpoint: clientWssEndpoint, pageTransform } = useBrowseStore.getState();

const { pages } = await apiAsyncNode.browse.fetchPages.mutate({
access: {
Expand All @@ -28,7 +28,7 @@ export async function callBrowseFetchPage(url: string) {
},
subjects: [{
url,
transform: 'markdown',
transform: pageTransform || 'text',
}],
screenshot: DEBUG_SHOW_SCREENSHOT ? {
width: 512,
Expand Down
46 changes: 29 additions & 17 deletions src/modules/browse/browse.router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import { TRPCError } from '@trpc/server';

import { BrowserContext, connect, ScreenshotOptions, TimeoutError } from '@cloudflare/puppeteer';

/**
* Puppeteer implementation of the worker
*/
import TurndownService from 'turndown';

import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
Expand Down Expand Up @@ -88,12 +91,14 @@ type BrowseAccessSchema = z.infer<typeof browseAccessSchema>;
type FetchPageWorkerOutputSchema = z.infer<typeof fetchPageWorkerOutputSchema>;


/**
* Puppeteer implementation of the worker
*/
async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string, transform: PageTransformSchema, ssWidth: number | undefined, ssHeight: number | undefined, ssQuality: number | undefined): Promise<FetchPageWorkerOutputSchema> {

// access
async function workerPuppeteer(
access: BrowseAccessSchema,
targetUrl: string,
transform: PageTransformSchema,
ssWidth: number | undefined,
ssHeight: number | undefined,
ssQuality: number | undefined,
): Promise<FetchPageWorkerOutputSchema> {
const browserWSEndpoint = (access.wssEndpoint || env.PUPPETEER_WSS_ENDPOINT || '').trim();
const isLocalBrowser = browserWSEndpoint.startsWith('ws://');
if (!browserWSEndpoint || (!browserWSEndpoint.startsWith('wss://') && !isLocalBrowser))
Expand Down Expand Up @@ -132,34 +137,41 @@ async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string, tr
result.stopReason = 'end';
}
} catch (error: any) {
const isTimeout: boolean = error instanceof TimeoutError;
const isTimeout = error instanceof TimeoutError;
result.stopReason = isTimeout ? 'timeout' : 'error';
if (!isTimeout)
result.error = '[Puppeteer] ' + error?.message || error?.toString() || 'Unknown goto error';
if (!isTimeout) {
result.error = '[Puppeteer] ' + (error?.message || error?.toString() || 'Unknown goto error');
}
}

// transform the content of the page as text
try {
if (result.stopReason !== 'error') {
switch (transform) {
case 'html':
result.content = await page.evaluate(() => document.documentElement.innerHTML);
break;
case 'markdown':
const html = await page.evaluate(() => document.documentElement.innerHTML);
const turndownService = new TurndownService();
result.content = turndownService.turndown(html);
result.content = await page.content();
break;
case 'text':
default:
result.content = await page.evaluate(() => document.body.innerText || document.textContent || '');
break;
case 'markdown':
await page.evaluate(() => {
// Remove unnecessary elements
document.querySelectorAll('script, style, nav, footer, aside, header, .ads, .comments')
.forEach(el => el.remove());
});
const cleanedHtml = await page.content();
const turndownService = new TurndownService({
headingStyle: 'atx',
});
result.content = turndownService.turndown(cleanedHtml);
break;
}
if (!result.content)
result.error = '[Puppeteer] Empty content';
}
} catch (error: any) {
result.error = '[Puppeteer] ' + error?.message || error?.toString() || 'Unknown evaluate error';
result.error = '[Puppeteer] ' + (error?.message || error?.toString() || 'Unknown evaluate error');
}

// get a screenshot of the page
Expand Down
8 changes: 8 additions & 0 deletions src/modules/browse/store-module-browsing.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@ import { CapabilityBrowsing } from '~/common/components/useCapabilities';
import { getBackendCapabilities } from '~/modules/backend/store-backend-capabilities';


export type BrowsePageTransform = 'html' | 'text' | 'markdown';

interface BrowseState {

wssEndpoint: string;
setWssEndpoint: (url: string) => void;

pageTransform: BrowsePageTransform;
setPageTransform: (transform: BrowsePageTransform) => void;

enableCommandBrowse: boolean;
setEnableCommandBrowse: (value: boolean) => void;

Expand All @@ -31,6 +36,9 @@ export const useBrowseStore = create<BrowseState>()(
wssEndpoint: '', // default WSS endpoint
setWssEndpoint: (wssEndpoint: string) => set(() => ({ wssEndpoint })),

pageTransform: 'text',
setPageTransform: (pageTransform: BrowsePageTransform) => set(() => ({ pageTransform })),

enableCommandBrowse: true,
setEnableCommandBrowse: (enableCommandBrowse: boolean) => set(() => ({ enableCommandBrowse })),

Expand Down

0 comments on commit 7b07bb7

Please sign in to comment.