🦜️🔗 LangChain
Example of how to use LangChain.js with Apify to crawl web data, vectorize it, and prompt the OpenAI model.
src/main.js
src/vector_index_cache.js
import { rm } from 'node:fs/promises';

import { ApifyDatasetLoader } from '@langchain/community/document_loaders/web/apify_dataset';
import { HNSWLib } from '@langchain/community/vectorstores/hnswlib';
import { ChatPromptTemplate } from '@langchain/core/prompts';
import { OpenAI, OpenAIEmbeddings } from '@langchain/openai';
import { Actor, log } from 'apify';
import { createStuffDocumentsChain } from 'langchain/chains/combine_documents';
import { createRetrievalChain } from 'langchain/chains/retrieval';
import { Document } from 'langchain/document';

// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
import { retrieveVectorIndex, cacheVectorIndex } from './vector_index_cache.js';

// Entry point of the Actor. The script:
//   1. reads the input and validates the required credentials,
//   2. obtains a vector index (from the cache, or by crawling and embedding the website),
//   3. asks the OpenAI model the query using the most relevant documents as context,
//   4. stores the result under the `OUTPUT` key and exits.
await Actor.init();

// Follow these steps to run this template:
// 1. If running locally, authenticate to the Apify platform by executing `apify login` in your terminal.
//    This is necessary to run the Website Content Crawler Actor for data gathering.
// 2. Set the `OPENAI_API_KEY` environment variable with your OpenAI API key, which can be obtained from
//    https://platform.openai.com/account/api-keys. Refer to
//    https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console for guidance
//    on setting environment variables.
const { OPENAI_API_KEY, APIFY_TOKEN } = process.env;

// You can configure the input for the Actor in the Apify UI when running on the Apify platform or by editing
// storage/key_value_stores/default/INPUT.json when running locally.
const {
    startUrls = [{ url: 'https://wikipedia.com' }],
    maxCrawlPages = 3,
    forceRecrawl = false, // Enforce a re-crawl of website content and re-creation of the vector index.
    query = 'What is Wikipedia?',
    openAIApiKey = OPENAI_API_KEY, // Falls back to the OPENAI_API_KEY environment variable when not present in the input.
} = (await Actor.getInput()) ?? {};

// Fail fast, before any crawling or API calls, when the credentials are missing.
if (!openAIApiKey) throw new Error('Please configure the OPENAI_API_KEY as environment variable or enter it into the input!');
if (!APIFY_TOKEN) throw new Error('Please configure the APIFY_TOKEN environment variable! Call `apify login` in your terminal to authenticate.');

// Local directory where the vector index will be stored.
const VECTOR_INDEX_PATH = './vector_index';

const prompt = ChatPromptTemplate.fromTemplate(
    `Answer the user's question: {input} based on the following context {context}`,
);

// Input for the https://apify.com/apify/website-content-crawler Actor that crawls the website to gather the data.
// The same object is passed to the cache helpers to identify the vector index that belongs to this crawl.
const websiteContentCrawlerInput = { startUrls, maxCrawlPages };

// A single embeddings client is shared by both the "build" and the "load" code paths.
const embeddings = new OpenAIEmbeddings({ openAIApiKey });

let res;
try {
    // First, we check if the vector index is already cached. If not, we run the website content crawler
    // to get the documents. By setting forceRecrawl=true you can enforce a re-scrape of the website content
    // and re-creation of the vector index; the cache lookup is skipped entirely in that case.
    let reinitializeIndex = true;
    if (forceRecrawl) {
        log.info('Re-crawl was forced by the input, ignoring any cached vector index.');
    } else {
        log.info('Fetching cached vector index from key-value store...');
        reinitializeIndex = !(await retrieveVectorIndex(websiteContentCrawlerInput));
        if (reinitializeIndex) log.info('Vector index was not found.');
    }

    // This variable will contain a vector index that we will use to retrieve the most relevant documents
    // for a given query.
    let vectorStore;
    if (reinitializeIndex) {
        // Run the Actor, wait for it to finish, and fetch its results from the Apify dataset
        // into a LangChain document loader.
        log.info('Running apify/website-content-crawler to gather the data...');
        const loader = await ApifyDatasetLoader.fromActorCall(
            'apify/website-content-crawler',
            websiteContentCrawlerInput,
            {
                datasetMappingFunction: (item) => new Document({
                    pageContent: (item.text || ''),
                    metadata: { source: item.url },
                }),
                clientOptions: { token: APIFY_TOKEN },
            },
        );

        // Initialize the vector index from the crawled documents.
        log.info('Feeding vector index with crawling results...');
        const docs = await loader.load();
        vectorStore = await HNSWLib.fromDocuments(docs, embeddings);

        // Save the vector index to the key-value store so that we can skip this phase in the next run.
        log.info('Saving vector index to the disk...');
        await vectorStore.save(VECTOR_INDEX_PATH);
        await cacheVectorIndex(websiteContentCrawlerInput, VECTOR_INDEX_PATH);
    } else {
        // The cache lookup succeeded, so load the index from the local directory.
        // NOTE(review): this relies on retrieveVectorIndex() restoring the index into VECTOR_INDEX_PATH;
        // confirm against src/vector_index_cache.js.
        log.info('Initializing the vector store...');
        vectorStore = await HNSWLib.load(VECTOR_INDEX_PATH, embeddings);
    }

    // Next, create the retrieval chain and enter a query:
    const llm = new OpenAI({ openAIApiKey });
    const combineDocsChain = await createStuffDocumentsChain({
        llm,
        prompt,
    });

    const chain = await createRetrievalChain({
        combineDocsChain,
        retriever: vectorStore.asRetriever(),
        returnSourceDocuments: true,
    });

    log.info('Asking model a question...');
    res = await chain.invoke({ input: query });

    log.info(`Question: ${query}`);
    log.info(`Model response: ${res.answer}`);
} finally {
    // Remove the local vector index directory even when a step above failed, so it is not left behind.
    // `force: true` keeps this cleanup from throwing (and masking the original error)
    // when the directory was never created.
    await rm(VECTOR_INDEX_PATH, { recursive: true, force: true });
}

await Actor.setValue('OUTPUT', res);
await Actor.exit();
LangChain.js template
LangChain is a framework for developing applications powered by language models.
This example template illustrates how to use LangChain.js with Apify to crawl web data, vectorize it, and prompt the OpenAI model. All of this happens within a single Apify Actor in slightly over a hundred lines of code.
Included features
- Apify SDK - a toolkit for building Actors
- Input schema - define and easily validate a schema for your actor's input
- Langchain.js - a framework for developing applications powered by language models
- OpenAI - a powerful language model
How it works
The code contains the following steps:
- Crawls given website using Website Content Crawler Actor.
- Vectorizes the data using the OpenAI API.
- Caches the vector index in the key-value store so that when you run Actor for the same website again, the cached data are used to speed it up.
- Data are fed to the OpenAI model using Langchain.js, and a given query is asked.
Before you start
To be able to run this template both locally and on the Apify platform, you need to:
- Have an Apify account and sign into it using the `apify login` command in your terminal. Without this, you won't be able to run the required Website Content Crawler Actor to gather the data.
- Have an OpenAI account and an API key. This is needed for vectorizing the data and also to be able to prompt the OpenAI model.
  - When running locally, store the key as the `OPENAI_API_KEY` environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console).
  - When running on the Apify platform, you can simply paste the key into the input field in the input UI.
Production use
This serves purely as an example of the whole pipeline.
For production use, we recommend that you:
- Separate crawling, data vectorization, and prompting into separate Actors. This way, you can run them independently and scale them separately.
- Replace the local vector store with Pinecone or a similar database. See the LangChain.js docs for more information.
Resources
- Pinecone integration Actor
- How to use Pinecone with LLMs
- How to use LangChain with OpenAI, Pinecone, and Apify
- Integration with Zapier, Make, Google Drive and others
- Video guide on getting data using Apify API
- A short guide on how to create web scrapers using code templates
Scrape single page with provided URL with Axios and extract data from page's HTML with Cheerio.
A scraper example that uses Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.
Example of a Puppeteer and headless Chrome web scraper. Headless browsers render JavaScript and are harder to block, but they're slower than plain HTTP.
Web scraper example with Crawlee, Playwright and headless Chrome. Playwright is more modern, user-friendly and harder to block than Puppeteer.
Skeleton project that helps you quickly bootstrap `CheerioCrawler` in JavaScript. It's best for developers who already know Apify SDK and Crawlee.
Example of running Cypress tests and saving their results on the Apify platform. JSON results are saved to Dataset, videos to Key-value store.