Tutti Fajly Actor

Developed by Vratislav Bartonicek
Maintained by Community

This actor was created just to test multifile support.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 2
Monthly users: 1
Runs succeeded: >99%
Last modified: 4 years ago

src/MM6.exe


src/crawler.py

import re

import scrapy


class ChampionsLeagueSpider(scrapy.Spider):
    name = 'championsleaguespider'
    start_urls = ['https://www.uefa.com/uefachampionsleague/season=2018/clubs/']

    custom_settings = {
        'USER_AGENT': 'DDWcrawler',
        'ROBOTSTXT_OBEY': True,
        'DOWNLOAD_DELAY': 0.001,
    }

    # Parse statistics data of the current season
    def parse_phase(self, fields):
        phase = {}
        for field in fields:
            label = field.css('.statistics--list--label::text').extract_first()

            # Passing accuracy field
            if label == 'passing accuracy':
                value = field.css('.graph-circle--additional-text::text').extract_first()

            # Cards field: two counts, with the card colour in the image title
            elif label == 'Cards':
                cards = field.xpath('.//span[@class="statistics--list--data"]/text()').extract()
                labels = field.css('.statistics--list--data img::attr(title)').extract()

                yellow = re.findall(r'\d+', cards[0])
                red = re.findall(r'\d+', cards[1])

                value = {
                    labels[0].lower().replace(" ", "-"): yellow[0],
                    labels[1].lower().replace(" ", "-"): red[0],
                }

            # Passing types field: one bar per pass type
            elif label == 'Passing types':
                bars = field.css('.statistics--list--data .graph-bar-container .bar-container')
                value = {}

                for bar in bars:
                    pass_type = bar.css('span:not(.bar)::text').extract_first()
                    type_label = pass_type.split(' ')[1].lower()
                    type_value = bar.css('span:not(.bar) b::text').extract_first() + ' ' + \
                        pass_type.split(' ')[2]
                    value[type_label] = type_value

            # Goal types field
            elif label == 'Type of goal':
                goal_types = field.css('.statistics--list--data .graph-dummy-container > div')
                value = {}
                for goal_type in goal_types:
                    type_label = goal_type.xpath('.//span/text()').extract()
                    type_value = goal_type.css('div > span::text').extract_first()
                    value[type_label[1].lower().replace(" ", "-")] = type_value

            # Standard fields
            else:
                value = field.css('.statistics--list--data::text').extract_first()

            phase[label.lower().replace(" ", "-")] = value

        return phase

    # Collect statistics data and split it into phases
    def parse_stats(self, response):
        player = response.meta['player']
        stats_sections = response.css('.player--statistics--list')
        player['matches'] = {}

        for section in stats_sections:
            fields = section.css('.field')
            phase_name = section.css('.stats-header::text').extract_first()

            # Tournament phase
            if phase_name == 'Tournament phase':
                player['matches']['tournament'] = self.parse_phase(fields)

            # Qualification phase
            elif phase_name == 'Qualifying':
                player['matches']['qualification'] = self.parse_phase(fields)

        yield player

    # Visit the player's profile page, collect biography data and get a link to the statistics page
    def parse_player(self, response):
        stats_url = response.css(
            '.content-wrap .section--footer a[title="More statistics"]::attr(href)').extract_first()

        for player in response.xpath('//div[@class="content-wrap"]'):
            bio = {
                'name': player.css('.player-header_name::text').extract_first(),
                'position': player.css('.player-header_category::text').extract_first(),
                'team': player.css('.player-header_team-name::text').extract_first(),
                'nationality': player.css('.player-header_country::text').extract_first(),
                'birthdate': player.css('.profile--list--data[itemprop=birthdate]::text').extract_first().split(' ')[0],
                'height': player.css('.profile--list--data[itemprop=height]::text').extract_first(),
                'weight': player.css('.profile--list--data[itemprop=weight]::text').extract_first(),
            }

            if stats_url:
                yield scrapy.Request(response.urljoin(stats_url), callback=self.parse_stats, meta={'player': bio})
            else:
                yield bio

    # Visit the page of each club and collect links to player profiles
    def parse_clubs(self, response):
        players = response.css('#team-data .squad--team-player')
        players_urls = players.css('.squad--player-name > a::attr(href)').extract()

        for player_url in players_urls:
            yield scrapy.Request(response.urljoin(player_url), callback=self.parse_player)

    # Get a list of clubs from the starting page
    def parse(self, response):
        clubs = response.css('.teams-overview_group .team > a')
        clubs_tocrawl = clubs.css('::attr(href)').extract()

        for club_url in clubs_tocrawl:
            yield scrapy.Request(response.urljoin(club_url), callback=self.parse_clubs)
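
For a quick local test outside the Apify platform, the spider can be driven by Scrapy's CrawlerProcess. A minimal sketch, not part of the actor: it assumes Scrapy 2.1+ (for the FEEDS setting), that crawler.py is importable from the working directory, and an arbitrary players.json output name:

from scrapy.crawler import CrawlerProcess

from crawler import ChampionsLeagueSpider

# Write every yielded item (players and their stats) to a JSON feed
process = CrawlerProcess(settings={
    'FEEDS': {'players.json': {'format': 'json'}},
})
process.crawl(ChampionsLeagueSpider)
process.start()  # blocks until the crawl finishes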

Dockerfile

FROM apify/actor-node-basic

# Copy source code
COPY . ./
# COPY src ./src

RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && npm list \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

Overview.jpg

main.js

const Apify = require('apify');
const request = require('request-promise');

Apify.main(async () => {
    // Get input of your actor
    const input = await Apify.getValue('INPUT');
    console.log('My input:');
    console.dir(input);

    // Do something useful here
    const html = await request('http://www.example.com');

    // And then save output
    const output = {
        html,
        crawledAt: new Date(),
    };
    console.log('My output:');
    console.dir(output);
    await Apify.setValue('OUTPUT', output);
});

package.json

{
  "name": "actors-name",
  "version": "0.0.1",
  "description": "Actors description",
  "main": "main.js",
  "dependencies": {
    "apify": "^0.11.5",
    "request-promise": "^4.2.2"
  },
  "scripts": {
    "start": "node main.js"
  },
  "author": "Actor creators name"
}

pokus.csv

something;this;3;
something;that;5;
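
Should the test file need to be read back, Python's built-in csv module handles the semicolon delimiter. A minimal sketch, assuming the two rows above are representative; the column names are made up for illustration, and the trailing ';' produces an empty final field:

import csv

# pokus.csv: ';'-delimited, no header row; the trailing ';' yields an empty last field
with open('pokus.csv', newline='') as f:
    for first, second, number, _ in csv.reader(f, delimiter=';'):
        print(first, second, int(number))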