import type { CheerioAPI, load as LoadT } from 'cheerio';
import { Document } from 'langchain/document';
import { BaseDocumentLoader } from 'langchain/document_loaders';
import type { DocumentLoader } from 'langchain/document_loaders';
import { CheerioWebBaseLoader } from 'langchain/document_loaders';

/**
 * Document loader that fetches a single web page and turns it into one
 * `Document`.
 *
 * The page is parsed with cheerio and the following parts are extracted:
 * - title: text of `h1.entry-title`
 * - date: `content` attribute of `meta[property="article:published_time"]`
 * - body: text of `.entry-content`, with `div.elementor` and `style`
 *   descendants stripped out first
 *
 * NOTE(review): these selectors look tailored to one specific site theme;
 * confirm they match the pages this loader is pointed at.
 */
export class CustomWebLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /**
   * @param webPath URL of the page to load.
   */
  constructor(public webPath: string) {
    super();
  }

  /**
   * Fetches `url` and parses the response body with cheerio.
   *
   * @param url Address of the page to download.
   * @returns A cheerio root for the downloaded HTML.
   * @throws Error if cheerio cannot be imported, or if the server answers
   *   with a non-2xx status.
   */
  static async _scrape(url: string): Promise<CheerioAPI> {
    const { load } = await CustomWebLoader.imports();
    const response = await fetch(url);

    // fetch() only rejects on network failure; without this check an HTTP
    // error page would be parsed and returned as if it were the article.
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url}: ${response.status} ${response.statusText}`,
      );
    }

    const html = await response.text();
    return load(html);
  }

  /**
   * Fetches and parses the page at `this.webPath`.
   *
   * @returns A cheerio root for the page.
   */
  async scrape(): Promise<CheerioAPI> {
    return CustomWebLoader._scrape(this.webPath);
  }

  /**
   * Loads the page and extracts its title, publication date and main text.
   *
   * @returns A one-element array holding the document. `pageContent` is the
   *   whitespace-normalised body text; `metadata` carries `source`, `title`,
   *   `date` (undefined when the meta tag is absent) and `contentLength`
   *   (number of `\w+` word matches in the body text).
   */
  async load(): Promise<Document[]> {
    const $ = await this.scrape();

    const title = $('h1.entry-title').text();
    const date = $('meta[property="article:published_time"]').attr('content');

    // Work on a clone so the removals below do not mutate the parsed page;
    // .end() steps back from the removed set to the clone itself.
    const content = $('.entry-content')
      .clone()
      .find('div.elementor, style')
      .remove()
      .end()
      .text();

    // Collapse every run of whitespace into a single space.
    const cleanedContent = content.replace(/\s+/g, ' ').trim();
    // match() returns null when there are no words, hence the fallback to 0.
    const contentLength = cleanedContent.match(/\b\w+\b/g)?.length ?? 0;

    const metadata = { source: this.webPath, title, date, contentLength };
    return [new Document({ pageContent: cleanedContent, metadata })];
  }

  /**
   * Lazily imports cheerio so the dependency is only required when a page
   * is actually scraped.
   *
   * @returns An object exposing cheerio's `load` function.
   * @throws Error with installation instructions if the import fails; the
   *   underlying error is logged to the console first.
   */
  static async imports(): Promise<{
    load: typeof LoadT;
  }> {
    try {
      const { load } = await import('cheerio');
      return { load };
    } catch (e) {
      console.error(e);
      throw new Error(
        'Please install cheerio as a dependency with, e.g. `yarn add cheerio`',
      );
    }
  }
}