custom_web_loader.ts

import type { CheerioAPI, load as LoadT } from 'cheerio';
import { Document } from 'langchain/document';
import { BaseDocumentLoader } from 'langchain/document_loaders';
import type { DocumentLoader } from 'langchain/document_loaders';

export class CustomWebLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  constructor(public webPath: string) {
    super();
  }

  // Fetch the page and parse the HTML with cheerio.
  static async _scrape(url: string): Promise<CheerioAPI> {
    const { load } = await CustomWebLoader.imports();
    const response = await fetch(url);
    const html = await response.text();
    return load(html);
  }

  async scrape(): Promise<CheerioAPI> {
    return CustomWebLoader._scrape(this.webPath);
  }

  // Extract the title, publish date, and cleaned body text, and wrap them
  // in a single LangChain Document.
  async load(): Promise<Document[]> {
    const $ = await this.scrape();
    const title = $('h1.entry-title').text();
    const date = $('meta[property="article:published_time"]').attr('content');

    // Strip Elementor wrappers and inline styles before taking the text.
    const content = $('.entry-content')
      .clone()
      .find('div.elementor, style')
      .remove()
      .end()
      .text();

    // Collapse whitespace and count words for the metadata.
    const cleanedContent = content.replace(/\s+/g, ' ').trim();
    const contentLength = cleanedContent?.match(/\b\w+\b/g)?.length ?? 0;

    const metadata = { source: this.webPath, title, date, contentLength };

    return [new Document({ pageContent: cleanedContent, metadata })];
  }

  // Lazily import cheerio so it stays an optional dependency.
  static async imports(): Promise<{
    load: typeof LoadT;
  }> {
    try {
      const { load } = await import('cheerio');
      return { load };
    } catch (e) {
      console.error(e);
      throw new Error(
        'Please install cheerio as a dependency with, e.g. `yarn add cheerio`',
      );
    }
  }
}
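
A minimal usage sketch of the loader above, assuming cheerio and langchain are installed; the URL is a hypothetical placeholder and is not part of the original file:

import { CustomWebLoader } from './custom_web_loader';

async function main() {
  // Hypothetical URL; any WordPress-style page exposing h1.entry-title and
  // .entry-content markup will match the selectors used by the loader.
  const loader = new CustomWebLoader('https://example.com/blog/some-post');
  const docs = await loader.load();
  console.log(docs[0].metadata); // { source, title, date, contentLength }
}

main().catch(console.error);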