-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
38 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,84 +1,49 @@ | ||
# -*- coding: utf-8 -*- | ||
import urllib.parse as urlparse | ||
from typing import List | ||
from urllib.parse import parse_qs | ||
import xml.etree.ElementTree as ET | ||
|
||
import scrapy | ||
from scrapy.shell import inspect_response | ||
from jedeschule.items import School | ||
from scrapy import Item | ||
import re | ||
|
||
|
||
class BerlinSpider(scrapy.Spider):
    """Spider for Berlin schools.

    Data source is the Berlin WFS endpoint (``fis:s_schulen``), which
    returns GML/XML, replacing the older HTML school-list pages.
    """
    name = "berlin"
    # Potential errors of Berlin:
    # 502 with user agent = default (scrapy) -> use a real user agent like "jedeschule"
    # 429 with download delay = default -> set download delay to slow down scrapy
    # Custom settings keep this workaround local to this spider, so other
    # spiders are not affected by fixing a spider-specific problem.
    custom_settings = {
        'USER_AGENT': 'jedeschule (open data project)',
        'DOWNLOAD_DELAY': 1,
    }
    # Base URL of the legacy HTML directory (still used by the HTML parse path).
    base_url = 'https://www.bildung.berlin.de/Schulverzeichnis/'
    start_url = base_url + 'SchulListe.aspx'
    # NOTE(review): a dead `start_urls = [start_url]` assignment was removed here;
    # it was immediately overwritten by the WFS GetFeature URL below.
    start_urls = ['https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:25833&typename=fis:s_schulen']
|
||
def parse(self, response):
    """Follow every school detail link found on the list page.

    Each request gets its own cookiejar (keyed by the link's position),
    and is handled by :meth:`parse_detail`.
    """
    detail_links = response.css('td a::attr(href)').extract()
    for jar_id, href in enumerate(detail_links):
        yield scrapy.Request(
            self.base_url + href,
            callback=self.parse_detail,
            meta={'cookiejar': jar_id},
        )
|
||
def parse_detail(self, response):
    """Scrape a single school's detail page into a plain dict item."""

    def first_text(element_id):
        # First text node of the page element with the given id, or None.
        return response.css('#{}::text'.format(element_id)).extract_first()

    meta = {}
    meta['name'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_lblSchulname').strip())
    meta['id'] = self._parse_school_no(response.url)
    meta['address'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_lblStrasse'))
    # "12345 Ortsname" -> zip, city (split only on the first space)
    meta['zip'], meta['city'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_lblOrt')).split(" ", 1)
    # Field looks like "Schulart (Traeger)" -> split on the parentheses.
    type_and_status = re.split(
        '[()]', first_text('ContentPlaceHolderMenuListe_lblSchulart'))
    meta['schooltype'] = self.fix_data(type_and_status[0].strip())
    meta['legal_status'] = self.fix_data(type_and_status[1].strip())
    meta['telephone'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_lblTelefon'))
    meta['fax'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_lblFax'))
    meta['mail'] = self.fix_data(
        first_text('ContentPlaceHolderMenuListe_HLinkEMail'))
    meta['web'] = self.fix_data(
        response.css('#ContentPlaceHolderMenuListe_HLinkWeb::attr(href)').extract_first())
    headmaster = first_text('ContentPlaceHolderMenuListe_lblLeitung')
    if headmaster:
        # "Lastname, Firstname" -> "Firstname Lastname"
        meta['headmaster'] = self.fix_data(
            ' '.join(headmaster.split(',')[::-1]).strip())
    meta['cookiejar'] = response.meta['cookiejar']
    meta['data_url'] = response.url
    activities = self.fix_data(first_text('ContentPlaceHolderMenuListe_lblAGs'))
    if activities:
        meta['activities'] = [part.strip() for part in activities.split(';')]
    partner = self.fix_data(first_text('ContentPlaceHolderMenuListe_lblPartner'))
    if partner:
        meta['partner'] = [part.strip() for part in partner.split(';')]
    yield meta
|
||
def _parse_school_no(self, url): | ||
"""Parses the school number from the 'IDSchulzweig' parameter in the url""" | ||
parsed = urlparse.urlparse(url) | ||
id_in_url: List[str] = parse_qs(parsed.query, max_num_fields=1)['IDSchulzweig'] | ||
assert len(id_in_url) == 1 | ||
|
||
return id_in_url[0].strip() | ||
|
||
# fix wrong tabs, spaces and new lines
def fix_data(self, string):
    """Collapse all runs of whitespace in *string* to single spaces.

    None and empty strings are passed through unchanged.
    """
    if string:
        # split() with no args splits on any whitespace (tabs, newlines,
        # repeated spaces), so join/split alone normalizes everything.
        # NOTE(review): the previous `string.replace('\n', '')` /
        # `string.replace('\t', '')` calls were no-ops — str.replace
        # returns a new string — and redundant anyway; removed.
        string = ' '.join(string.split())
    return string
|
||
def parse(self, response):
    """Parse the WFS GetFeature XML response and yield one dict per school.

    NOTE(review): the diff-merged source showed this body under a
    ``normalize(self, item)`` signature while the body reads ``response`` —
    reconstructed as the spider's parse callback, which the body matches.
    """
    tree = ET.fromstring(response.body)

    namespaces = {
        "gml": "http://www.opengis.net/gml",
        "fis": "http://www.berlin.de/broker",
    }
    for school in tree.findall("gml:featureMember", namespaces):
        data_elem = {}
        # school[0] is the fis:s_schulen feature element inside the member.
        for entry in school[0]:
            if entry.tag == "{http://www.berlin.de/broker}geom":
                # This nested entry contains the coordinates that we would like to expand
                lat, lon = entry.findtext(
                    "gml:Point/gml:pos", namespaces=namespaces
                ).split(" ")
                data_elem["lat"] = lat
                data_elem["lon"] = lon
                continue
            # strip the namespace before returning
            data_elem[entry.tag.split("}", 1)[1]] = entry.text
        yield data_elem
|
||
@staticmethod
def normalize(item: Item) -> School:
    """Map a raw WFS feature dict onto the shared School item.

    NOTE(review): the diff-merged source contained both old and new
    keyword arguments (duplicate ``id=``, ``legal_status=``, ``phone=`` …),
    which is a SyntaxError; only the new WFS field mappings are kept.
    """
    return School(
        name=item.get('name'),
        # 'bsn' is the school number in the WFS data — TODO confirm
        id='BE-{}'.format(item.get('bsn')),
        address=" ".join([item.get('strasse'), item.get('hausnr')]),
        zip=item.get('plz'),
        city='Berlin',  # the data source covers Berlin only
        website=item.get('internet'),
        email=item.get('email'),
        school_type=item.get('schulart'),
        legal_status=item.get('traeger'),
        fax=item.get('fax'),
        # presumably 'headmaster' is absent in the WFS dicts (would be None)
        # — verify against the parse output
        director=item.get('headmaster'),
        phone=item.get('telefon'),
        latitude=item.get('lat'),
        longitude=item.get('lon'),
    )