scrape-embed.ts

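// Scrape a list of Notion URLs, split the pages into chunks, and store
// OpenAI embeddings of those chunks in a Supabase vector store.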
import { Document } from 'langchain/document';
import * as fs from 'fs/promises';
import { CustomWebLoader } from '@/utils/custom_web_loader';
import type { SupabaseClient } from '@supabase/supabase-js';
import { Embeddings, OpenAIEmbeddings } from 'langchain/embeddings';
import { SupabaseVectorStore } from 'langchain/vectorstores';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { supabaseClient } from '@/utils/supabase-client';
import { urls } from '@/config/notionurls';
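
// Scrape a single URL into LangChain Documents. Errors are logged and
// swallowed so one failing page does not abort the whole run.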
async function extractDataFromUrl(url: string): Promise<Document[]> {
  try {
    const loader = new CustomWebLoader(url);
    const docs = await loader.load();
    return docs;
  } catch (error) {
    console.error(`Error while extracting data from ${url}: ${error}`);
    return [];
  }
}
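
// Scrape every URL sequentially, then persist the raw documents to disk
// (franknotion.json) as a local copy of the scraped data.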
async function extractDataFromUrls(urls: string[]): Promise<Document[]> {
  console.log('extracting data from urls...');
  const documents: Document[] = [];
  for (const url of urls) {
    const docs = await extractDataFromUrl(url);
    documents.push(...docs);
  }
  console.log('data extracted from urls');
  const json = JSON.stringify(documents);
  await fs.writeFile('franknotion.json', json);
  console.log('json file containing data saved on disk');
  return documents;
}
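
// Embed the documents with the given embeddings model and store the
// resulting vectors in the Supabase vector store.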
async function embedDocuments(
  client: SupabaseClient,
  docs: Document[],
  embeddings: Embeddings,
) {
  console.log('creating embeddings...');
  await SupabaseVectorStore.fromDocuments(client, docs, embeddings);
  console.log('embeddings successfully stored in supabase');
}
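
// Split documents into 2000-character chunks with 200 characters of overlap
// so each chunk fits comfortably in the OpenAI context window.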
async function splitDocsIntoChunks(docs: Document[]): Promise<Document[]> {
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 2000,
    chunkOverlap: 200,
  });
  return await textSplitter.splitDocuments(docs);
}
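
// Entry point: scrape, chunk, and embed the configured URLs.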
(async function run(urls: string[]) {
  try {
    // load data from each url
    const rawDocs = await extractDataFromUrls(urls);
    // split docs into chunks for the OpenAI context window
    const docs = await splitDocsIntoChunks(rawDocs);
    // embed docs into supabase
    await embedDocuments(supabaseClient, docs, new OpenAIEmbeddings());
  } catch (error) {
    console.log('error occurred:', error);
  }
})(urls);