# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image.
# It should be the only file that affects the dependency install in the next step,
# so copying it on its own lets the dependency layer be reused from the build cache
# whenever only the source code changes.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the Python version, the pip version and all installed packages
# with their versions so the build log is useful for debugging.
# --no-cache-dir keeps pip's download cache out of the resulting image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be really fast
# for most source file changes.
COPY . ./

# Byte-compile all sources so that syntax errors fail the build
# instead of surfacing when the Actor is started.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-3",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        }
    },
    "required": ["url"]
}

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7

src/__main__.py

"""
This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
settings. The `main()` coroutine is then executed using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Configure loggers: a single stream handler with the Apify log formatter
# is shared by both loggers below.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

# Messages from the `apify_client` logger are emitted from INFO level up.
apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

# Messages from the `apify` logger are emitted from DEBUG level up.
apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Execute the Actor main coroutine
asyncio.run(main())

src/main.py

"""
This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.

Feel free to modify this file to suit your specific needs.

To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
https://docs.apify.com/sdk/python
"""

# Beautiful Soup - library for pulling data out of HTML and XML files, read more at
# https://www.crummy.com/software/BeautifulSoup/bs4/doc
from bs4 import BeautifulSoup

# HTTPX - library for making asynchronous HTTP requests in Python, read more at https://www.python-httpx.org/
from httpx import AsyncClient

# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python
from apify import Actor


async def main() -> None:
    """
    Fetch the page given by the `url` input field and store all of its headings in the dataset.

    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Raises:
        ValueError: If the Actor input does not contain a non-empty `url`.
    """
    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')

        # Fail fast with a clear message instead of handing `None` to the HTTP client
        if not url:
            raise ValueError('Missing "url" attribute in input!')

        # Create an asynchronous HTTPX client
        async with AsyncClient() as client:
            # Fetch the HTML content of the page.
            response = await client.get(url, follow_redirects=True)

        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all headings from the page (tag name and text)
        headings = []
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            heading_object = {'level': heading.name, 'text': heading.text}
            Actor.log.info(f'Extracted heading: {heading_object}')
            headings.append(heading_object)

        # Save headings to Dataset - a table-like storage
        await Actor.push_data(headings)

My Actor

rezaczu/actor-name

Zuzana Řezáčová

My Actor

test-fest/my-actor

test-fest

4.1

PPR Actor

ruly_villa/ppr-actor

Zuzana Štětinová

4.5

PPR Actor

rezaczu/ppr-actor

Zuzana Řezáčová

PPE actor

ruly_villa/ppe-actor

Zuzana Štětinová

Actor 1

jkuzz/actor-1

Jan Kuželík

5.0

Python Example

apify/python-example

Example Actor written in Python, showing how to read the Actor input and push to the Actor's default dataset.

Apify Technologies

Start Actor

groovy_hyperbole/start-actor

Jan Novotny

Testing ppr actor

rezaczu/testing-ppr-actor

Zuzana Řezáčová

4.0

PPR testing actor

knowing_didgeridoo/ppr-testing-actor

Jan Novotny

# First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. FROM apify/actor-python:3.11 # Second, copy just requirements.txt into the Actor image, # since it should be the only file that affects the dependency install in the next step, # in order to speed up the build COPY requirements.txt ./ # Install the packages specified in requirements.txt, # Print the installed Python version, pip version # and all installed packages with their versions for debugging RUN echo "Python version:" \ && python --version \ && echo "Pip version:" \ && pip --version \ && echo "Installing dependencies:" \ && pip install -r requirements.txt \ && echo "All installed Python packages:" \ && pip freeze # Next, copy the remaining files and directories with the source code. # Since we do this after installing the dependencies, quick build will be really fast # for most source file changes. COPY . ./ # Use compileall to ensure the runnability of the Actor Python code. RUN python3 -m compileall -q . # Specify how to launch the source code of your Actor. # By default, the "python3 -m src" command is run CMD ["python3", "-m", "src"]

{ "actorSpecification": 1, "name": "my-actor-3", "title": "Scrape single page in Python", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile" }

{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com/" } }, "required": ["url"] }

# This file tells Git which files shouldn't be added to source control .idea .DS_Store apify_storage storage/* !storage/key_value_stores storage/key_value_stores/* !storage/key_value_stores/default storage/key_value_stores/default/* !storage/key_value_stores/default/INPUT.json .venv/ .env/ __pypackages__ dist/ build/ *.egg-info/ *.egg __pycache__ .mypy_cache .dmypy.json dmypy.json .pytest_cache .ruff_cache .scrapy *.log

1# Feel free to add your Python dependencies below. For formatting guidelines, see: 2# https://pip.pypa.io/en/latest/reference/requirements-file-format/ 3 4apify ~= 1.7.0 5beautifulsoup4 ~= 4.12.2 6httpx ~= 0.25.2 7types-beautifulsoup4 ~= 4.12.0.7

1""" 2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging 3settings. The `main()` coroutine is then executed using `asyncio.run()`. 4 5Feel free to modify this file to suit your specific needs. 6""" 7 8import asyncio 9import logging 10 11from apify.log import ActorLogFormatter 12 13from .main import main 14 15# Configure loggers 16handler = logging.StreamHandler() 17handler.setFormatter(ActorLogFormatter()) 18 19apify_client_logger = logging.getLogger('apify_client') 20apify_client_logger.setLevel(logging.INFO) 21apify_client_logger.addHandler(handler) 22 23apify_logger = logging.getLogger('apify') 24apify_logger.setLevel(logging.DEBUG) 25apify_logger.addHandler(handler) 26 27# Execute the Actor main coroutine 28asyncio.run(main())

1""" 2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file. 3 4Feel free to modify this file to suit your specific needs. 5 6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation: 7https://docs.apify.com/sdk/python 8""" 9 10# Beautiful Soup - library for pulling data out of HTML and XML files, read more at 11# https://www.crummy.com/software/BeautifulSoup/bs4/doc 12from bs4 import BeautifulSoup 13 14# HTTPX - library for making asynchronous HTTP requests in Python, read more at https://www.python-httpx.org/ 15from httpx import AsyncClient 16 17# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python 18from apify import Actor 19 20 21async def main() -> None: 22 """ 23 The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function 24 out of it, it will not work. Asynchronous execution is required for communication with Apify platform, 25 and it also enhances performance in the field of web scraping significantly. 26 """ 27 async with Actor: 28 # Structure of input is defined in input_schema.json 29 actor_input = await Actor.get_input() or {} 30 url = actor_input.get('url') 31 32 # Create an asynchronous HTTPX client 33 async with AsyncClient() as client: 34 # Fetch the HTML content of the page. 35 response = await client.get(url, follow_redirects=True) 36 37 # Parse the HTML content using Beautiful Soup 38 soup = BeautifulSoup(response.content, 'html.parser') 39 40 # Extract all headings from the page (tag name and text) 41 headings = [] 42 for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): 43 heading_object = {'level': heading.name, 'text': heading.text} 44 Actor.log.info(f'Extracted heading: {heading_object}') 45 headings.append(heading_object) 46 47 # Save headings to Dataset - a table-like storage 48 await Actor.push_data(headings)

Actor to be moved

Actor to be moved

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

.dockerignore

.editorconfig

.gitignore

requirements.txt

src/__main__.py

src/main.py

You might also like

My Actor

My Actor

PPR Actor

PPR Actor

PPE actor

Actor 1

Python Example

Start Actor

Testing ppr actor

PPR testing actor

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

.dockerignore

.editorconfig

.gitignore

requirements.txt

src/__main__.py

src/main.py

src/main.py

src/main.py