Tutti Fajly Actor
Try for free
No credit card required
Go to Store
Tutti Fajly Actor
vratous/tutti-fajly-actor
Try for free
No credit card required
This actor was created just to test multifile support
src/MM6.exe
Download
src/crawler.py
1import scrapy
2import re
3
4
class ChampionsLeagueSpider(scrapy.Spider):
    """Crawl the UEFA Champions League 2017/18 club listing, visit every
    club's squad page, then each player's profile and statistics pages,
    and yield one dict per player (biography plus per-phase match stats).
    """

    name = 'championsleaguespider'
    start_urls = ['https://www.uefa.com/uefachampionsleague/season=2018/clubs/']

    custom_settings = {
        'USER_AGENT': 'DDWcrawler',   # identify this crawler to the site
        'ROBOTSTXT_OBEY': True,       # honor robots.txt
        'DOWNLOAD_DELAY': 0.001,      # politeness delay between requests (seconds)
    }

    # Parse statistics data of the current season
    def parse_phase(self, fields):
        """Turn a list of `.field` selectors into a {label: value} dict.

        Values are plain strings for simple fields and nested dicts for
        the 'Cards', 'Passing types' and 'Type of goal' fields.
        """
        phase = {}
        for field in fields:
            label = field.css('.statistics--list--label::text').extract_first()
            if label is None:
                # Guard: a field without a label would crash on label.lower()
                # below; skip it instead.
                continue

            # Passing accuracy field (rendered as a circular graph).
            # NOTE(review): lowercase 'passing accuracy' differs from the
            # capitalized labels below — confirm against the live markup.
            if label == 'passing accuracy':
                value = field.css('.graph-circle--additional-text::text').extract_first()

            # Cards field: yellow/red counts keyed by the card images' titles.
            elif label == 'Cards':
                cards = field.xpath('.//span[@class="statistics--list--data"]/text()').extract()
                labels = field.css('.statistics--list--data img::attr(title)').extract()

                # Raw strings: '\d' is an invalid escape in a plain literal
                # (SyntaxWarning on modern Python).
                yellow = re.findall(r'\d+', cards[0])
                red = re.findall(r'\d+', cards[1])

                value = {
                    labels[0].lower().replace(" ", "-"): yellow[0],
                    labels[1].lower().replace(" ", "-"): red[0],
                }

            # Passing types field: one bar per pass type.
            elif label == 'Passing types':
                # Loop variable renamed from `type` to avoid shadowing the builtin.
                bars = field.css('.statistics--list--data .graph-bar-container .bar-container')
                value = {}
                for bar in bars:
                    pass_type = bar.css('span:not(.bar)::text').extract_first()
                    type_label = pass_type.split(' ')[1].lower()
                    type_value = bar.css('span:not(.bar) b::text').extract_first() + ' ' + \
                        pass_type.split(' ')[2]
                    value[type_label] = type_value

            # Goal types field.
            elif label == 'Type of goal':
                goal_divs = field.css('.statistics--list--data .graph-dummy-container > div')
                value = {}
                for goal_div in goal_divs:
                    type_label = goal_div.xpath('.//span/text()').extract()
                    type_value = goal_div.css('div > span::text').extract_first()
                    value[type_label[1].lower().replace(" ", "-")] = type_value

            # Standard fields: plain text value.
            else:
                value = field.css('.statistics--list--data::text').extract_first()

            phase[label.lower().replace(" ", "-")] = value

        return phase

    # Collect statistics data and split it into phases
    def parse_stats(self, response):
        """Attach per-phase statistics to the player dict passed via meta,
        then yield the completed player item."""
        player = response.meta['player']
        stats_sections = response.css('.player--statistics--list')
        player['matches'] = {}

        for section in stats_sections:
            fields = section.css('.field')
            phase_name = section.css('.stats-header::text').extract_first()

            # Tournament phase
            if phase_name == 'Tournament phase':
                player['matches']['tournament'] = self.parse_phase(fields)

            # Qualification phase
            elif phase_name == 'Qualifying':
                player['matches']['qualification'] = self.parse_phase(fields)

        yield player

    # Visit player's profile page, collect biography data and get a link to statistics page
    def parse_player(self, response):
        """Yield the player's biography, or follow the 'More statistics'
        link (carrying the biography in request meta) when present."""
        stats_url = response.css(
            '.content-wrap .section--footer a[title="More statistics"]::attr(href)').extract_first()

        for player in response.xpath('//div[@class="content-wrap"]'):
            bio = {
                'name': player.css('.player-header_name::text').extract_first(),
                'position': player.css('.player-header_category::text').extract_first(),
                'team': player.css('.player-header_team-name::text').extract_first(),
                'nationality': player.css('.player-header_country::text').extract_first(),
                # Birthdate text carries extra tokens after the date; keep only the first.
                'birthdate': player.css('.profile--list--data[itemprop=birthdate]::text').extract_first().split(' ')[0],
                'height': player.css('.profile--list--data[itemprop=height]::text').extract_first(),
                'weight': player.css('.profile--list--data[itemprop=weight]::text').extract_first(),
            }

            if stats_url:
                yield scrapy.Request(response.urljoin(stats_url), callback=self.parse_stats, meta={'player': bio})
            else:
                yield bio

    # Visit page of each club and collect links to players profiles
    def parse_clubs(self, response):
        """Follow every squad member's profile link on a club page."""
        players = response.css('#team-data .squad--team-player')
        players_urls = players.css('.squad--player-name > a::attr(href)').extract()

        for player_url in players_urls:
            yield scrapy.Request(response.urljoin(player_url), callback=self.parse_player)

    # Get a list of clubs from the starting page
    def parse(self, response):
        """Default callback: follow every club link on the overview page."""
        clubs = response.css('.teams-overview_group .team > a')

        clubs_tocrawl = clubs.css('::attr(href)').extract()

        for club_url in clubs_tocrawl:
            yield scrapy.Request(response.urljoin(club_url), callback=self.parse_clubs)
Dockerfile
FROM apify/actor-node-basic

# Copy source code into the actor image.
# (Removed the commented-out `COPY src ./src` — dead directive.)
COPY . ./

# Install production dependencies only, then log the installed packages
# and tool versions to aid debugging of actor builds.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version
Overview.jpg
main.js
const Apify = require('apify');
const request = require('request-promise');

// Actor entry point: read the INPUT record, fetch a sample page,
// and persist the result under the OUTPUT key.
Apify.main(async () => {
    // Get input of your actor
    const input = await Apify.getValue('INPUT');
    console.log('My input:');
    console.dir(input);

    // Do something useful here
    const html = await request('http://www.example.com');

    // And then save output
    const output = { html, crawledAt: new Date() };
    console.log('My output:');
    console.dir(output);
    await Apify.setValue('OUTPUT', output);
});
package.json
1{
2 "name": "actors-name",
3 "version": "0.0.1",
 "description": "Actor's description",
5 "main": "main.js",
6 "dependencies": {
7 "apify": "^0.11.5",
8 "request-promise": "^4.2.2"
9 },
10 "scripts": {
11 "start": "node main.js"
12 },
 "author": "Actor creator's name"
14}
pokus.csv
1something;this;3;
2something;that;5;
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
1 star
>99% runs succeeded
Created in Feb 2019
Modified 3 years ago
Categories