Tutti Fajly Actor
Pricing
Pay per usage
Go to Store

Tutti Fajly Actor
This actor was created just to test multifile support
0.0 (0)
Pricing
Pay per usage
1
Total users
2
Monthly users
1
Runs succeeded
>99%
Last modified
4 years ago
src/MM6.exe
Download
src/crawler.py
import scrapy
import re


class ChampionsLeagueSpider(scrapy.Spider):
    """Crawl uefa.com Champions League 2018 club pages and yield one item per
    player, combining biography data with per-phase match statistics.

    Crawl flow: parse (clubs list) -> parse_clubs (squad) ->
    parse_player (bio) -> parse_stats (statistics page).
    """

    name = 'championsleaguespider'
    start_urls = ['https://www.uefa.com/uefachampionsleague/season=2018/clubs/']

    custom_settings = {
        'USER_AGENT': 'DDWcrawler',
        'ROBOTSTXT_OBEY': True,
        'DOWNLOAD_DELAY': 0.001,
    }

    def parse_phase(self, fields):
        """Parse one statistics section into a dict keyed by slugified label.

        ``fields`` is a SelectorList of ``.field`` nodes. Most labels map to a
        plain text value; 'Cards', 'Passing types' and 'Type of goal' are
        unpacked into nested dicts.
        """
        phase = {}
        for field in fields:
            label = field.css('.statistics--list--label::text').extract_first()

            # Robustness: skip malformed fields with no label instead of
            # crashing on label.lower() below.
            if label is None:
                continue

            # Passing accuracy field — value sits inside a circle-graph widget
            if label == 'passing accuracy':
                value = field.css('.graph-circle--additional-text::text').extract_first()

            # Cards field — two counts (yellow/red) with icon titles as labels
            elif label == 'Cards':
                cards = field.xpath('.//span[@class="statistics--list--data"]/text()').extract()
                labels = field.css('.statistics--list--data img::attr(title)').extract()

                # Raw strings: '\d' in a non-raw literal is an invalid escape
                # sequence (SyntaxWarning/DeprecationWarning on modern Python).
                yellow = re.findall(r'\d+', cards[0])
                red = re.findall(r'\d+', cards[1])

                value = {
                    labels[0].lower().replace(" ", "-"): yellow[0],
                    labels[1].lower().replace(" ", "-"): red[0],
                }

            # Passing types field — one bar per pass type
            elif label == 'Passing types':
                bars = field.css('.statistics--list--data .graph-bar-container .bar-container')
                value = {}
                # 'bar' instead of 'type' — don't shadow the builtin.
                for bar in bars:
                    pass_type = bar.css('span:not(.bar)::text').extract_first()
                    type_label = pass_type.split(' ')[1].lower()
                    type_value = (bar.css('span:not(.bar) b::text').extract_first()
                                  + ' ' + pass_type.split(' ')[2])
                    value[type_label] = type_value

            # Goal types field — one row per goal type
            elif label == 'Type of goal':
                goal_rows = field.css('.statistics--list--data .graph-dummy-container > div')
                value = {}
                for row in goal_rows:
                    type_label = row.xpath('.//span/text()').extract()
                    type_value = row.css('div > span::text').extract_first()
                    value[type_label[1].lower().replace(" ", "-")] = type_value

            # Standard single-value statistic
            else:
                value = field.css('.statistics--list--data::text').extract_first()

            phase[label.lower().replace(" ", "-")] = value

        return phase

    def parse_stats(self, response):
        """Collect statistics data from a player's stats page, split it into
        tournament/qualification phases and yield the completed player item."""
        player = response.meta['player']
        stats_sections = response.css('.player--statistics--list')
        player['matches'] = {}

        for section in stats_sections:
            fields = section.css('.field')
            phase_name = section.css('.stats-header::text').extract_first()

            if phase_name == 'Tournament phase':
                player['matches']['tournament'] = self.parse_phase(fields)
            elif phase_name == 'Qualifying':
                player['matches']['qualification'] = self.parse_phase(fields)

        yield player

    def parse_player(self, response):
        """Visit a player's profile page, collect biography data, then follow
        the 'More statistics' link when present (otherwise yield the bio)."""
        stats_url = response.css(
            '.content-wrap .section--footer a[title="More statistics"]::attr(href)').extract_first()

        for player in response.xpath('//div[@class="content-wrap"]'):
            bio = {
                'name': player.css('.player-header_name::text').extract_first(),
                'position': player.css('.player-header_category::text').extract_first(),
                'team': player.css('.player-header_team-name::text').extract_first(),
                'nationality': player.css('.player-header_country::text').extract_first(),
                'birthdate': player.css('.profile--list--data[itemprop=birthdate]::text').extract_first().split(' ')[0],
                'height': player.css('.profile--list--data[itemprop=height]::text').extract_first(),
                'weight': player.css('.profile--list--data[itemprop=weight]::text').extract_first(),
            }

            if stats_url:
                yield scrapy.Request(response.urljoin(stats_url),
                                     callback=self.parse_stats,
                                     meta={'player': bio})
            else:
                yield bio

    def parse_clubs(self, response):
        """Visit the page of one club and schedule every squad member's
        profile page for parsing."""
        players = response.css('#team-data .squad--team-player')
        players_urls = players.css('.squad--player-name > a::attr(href)').extract()

        for player_url in players_urls:
            yield scrapy.Request(response.urljoin(player_url), callback=self.parse_player)

    def parse(self, response):
        """Entry point: collect club links from the starting page."""
        clubs = response.css('.teams-overview_group .team > a')
        clubs_urls = clubs.css('::attr(href)').extract()

        for club_url in clubs_urls:
            yield scrapy.Request(response.urljoin(club_url), callback=self.parse_clubs)
Dockerfile
FROM apify/actor-node-basic

# Copy source code
COPY . ./
#COPY src ./src

# Install production dependencies only, then print tool versions for the build log.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version
Overview.jpg
main.js
const Apify = require('apify');
const request = require('request-promise');

Apify.main(async () => {
    // Read and show the actor's input record.
    const actorInput = await Apify.getValue('INPUT');
    console.log('My input:');
    console.dir(actorInput);

    // Placeholder work: fetch a sample page.
    const pageHtml = await request('http://www.example.com');

    // Persist the result to the default key-value store.
    const result = {
        html: pageHtml,
        crawledAt: new Date(),
    };
    console.log('My output:');
    console.dir(result);
    await Apify.setValue('OUTPUT', result);
});
package.json
{ "name": "actors-name", "version": "0.0.1", "description": "Actors description", "main": "main.js", "dependencies": { "apify": "^0.11.5", "request-promise": "^4.2.2" }, "scripts": { "start": "node main.js" }, "author": "Actor creators name"}
pokus.csv
something;this;3;
something;that;5;