Commit a8152d1

Use WFS for Berlin
k-nut committed Apr 29, 2024
1 parent 23976cf commit a8152d1
Showing 1 changed file with 38 additions and 73 deletions.
111 changes: 38 additions & 73 deletions jedeschule/spiders/berlin.py
@@ -1,84 +1,49 @@
 # -*- coding: utf-8 -*-
-import urllib.parse as urlparse
-from typing import List
-from urllib.parse import parse_qs
+import xml.etree.ElementTree as ET

 import scrapy
-from scrapy.shell import inspect_response
 from jedeschule.items import School
 from scrapy import Item
-import re


 class BerlinSpider(scrapy.Spider):
     name = "berlin"
-    # Potential errors of Berlin:
-    # 502 with user agent = default (scrapy) -> use a real user agent like "jedeschule"
-    # 429 with download delay = default -> set download delay to slow down scrapy
-    # custom settings avoid other spiders from being affected of solving a spider individual problem
-    custom_settings = {'USER_AGENT': 'jedeschule (open data project)', 'DOWNLOAD_DELAY': 1,}
-    base_url = 'https://www.bildung.berlin.de/Schulverzeichnis/'
-    start_url = base_url + 'SchulListe.aspx'
-    start_urls = [start_url]
+    start_urls = ['https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&srsname=EPSG:25833&typename=fis:s_schulen']

     def parse(self, response):
-        schools = response.css('td a::attr(href)').extract()
-        for i, school in enumerate(schools):
-            # school = school.replace(' ','')
-            yield scrapy.Request(self.base_url + school, callback=self.parse_detail, meta={'cookiejar': i})
-
-    def parse_detail(self, response):
-        meta = {}
-        name = response.css('#ContentPlaceHolderMenuListe_lblSchulname::text').extract_first().strip()#.rsplit('-', 1)
-        meta['name'] = self.fix_data(name)
-        meta['id'] = self._parse_school_no(response.url)
-        meta['address'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblStrasse::text').extract_first())
-        meta['zip'], meta['city'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblOrt::text').extract_first()).split(" ", 1)
-        schooltype = re.split('[()]', response.css('#ContentPlaceHolderMenuListe_lblSchulart::text').extract_first())
-        meta['schooltype'] = self.fix_data(schooltype[0].strip())
-        meta['legal_status'] = self.fix_data(schooltype[1].strip())
-        meta['telephone'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblTelefon::text').extract_first())
-        meta['fax'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblFax::text').extract_first())
-        meta['mail'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_HLinkEMail::text').extract_first())
-        meta['web'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_HLinkWeb::attr(href)').extract_first())
-        headmaster = response.css('#ContentPlaceHolderMenuListe_lblLeitung::text').extract_first()
-        if headmaster:
-            meta['headmaster'] = self.fix_data(' '.join(headmaster.split(',')[::-1]).strip())
-        meta['cookiejar'] = response.meta['cookiejar']
-        meta['data_url'] = response.url
-        activities = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblAGs::text').extract_first())
-        if activities:
-            meta['activities'] = [x.strip() for x in activities.split(';')]
-        partner = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblPartner::text').extract_first())
-        if partner:
-            meta['partner'] = [x.strip() for x in partner.split(';')]
-        yield meta
-
-    def _parse_school_no(self, url):
-        """Parses the school number from the 'IDSchulzweig' parameter in the url"""
-        parsed = urlparse.urlparse(url)
-        id_in_url: List[str] = parse_qs(parsed.query, max_num_fields=1)['IDSchulzweig']
-        assert len(id_in_url) == 1
-
-        return id_in_url[0].strip()
-
-    # fix wrong tabs, spaces and new lines
-    def fix_data(self, string):
-        if string:
-            string = ' '.join(string.split())
-            string.replace('\n', '')
-            string.replace('\t', '')
-        return string
-
-    def normalize(self, item: Item) -> School:
+        tree = ET.fromstring(response.body)
+
+        namespaces = {
+            "gml": "http://www.opengis.net/gml",
+            "fis": "http://www.berlin.de/broker",
+        }
+        for school in tree.findall("gml:featureMember", namespaces):
+            data_elem = {}
+            for entry in school[0]:
+                if entry.tag == "{http://www.berlin.de/broker}geom":
+                    # This nested entry contains the coordinates that we would like to expand
+                    lat, lon = entry.findtext(
+                        "gml:Point/gml:pos", namespaces=namespaces
+                    ).split(" ")
+                    data_elem["lat"] = lat
+                    data_elem["lon"] = lon
+                    continue
+                # strip the namespace before returning
+                data_elem[entry.tag.split("}", 1)[1]] = entry.text
+            yield data_elem
+
+    @staticmethod
+    def normalize(item: Item) -> School:
         return School(name=item.get('name'),
-                      id='BE-{}'.format(item.get('id')),
-                      address=item.get('address'),
-                      zip=item.get('zip'),
-                      city=item.get('city'),
-                      website=item.get('web'),
-                      email=item.get('mail'),
-                      school_type=item.get('schooltype'),
+                      id='BE-{}'.format(item.get('bsn')),
+                      address=" ".join([item.get('strasse'), item.get('hausnr')]),
+                      zip=item.get('plz'),
+                      city='Berlin',
+                      website=item.get('internet'),
+                      email=item.get('email'),
+                      school_type=item.get('schulart'),
+                      legal_status=item.get('traeger'),
                       fax=item.get('fax'),
-                      phone=item.get('telephone'),
-                      director=item.get('headmaster'),
-                      legal_status=item.get('legal_status'))
+                      phone=item.get('telefon'),
+                      latitude=item.get('lat'),
+                      longitude=item.get('lon')
+                      )
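
For context on the new start URL: it is a plain OGC WFS 1.1.0 GetFeature request against Berlin's FIS-Broker geodata service. Below is a minimal sketch of how the query string decomposes; the endpoint and layer name are copied verbatim from the spider, and the comments state standard WFS semantics:

# Sketch: the spider's start URL rebuilt from its WFS query parameters.
from urllib.parse import urlencode

params = {
    "SERVICE": "WFS",             # OGC Web Feature Service
    "VERSION": "1.1.0",           # WFS protocol version
    "REQUEST": "GetFeature",      # return the layer's features as GML
    "srsname": "EPSG:25833",      # requested CRS: ETRS89 / UTM zone 33N
    "typename": "fis:s_schulen",  # the Berlin schools feature type
}
url = ("https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_schulen?"
       + urlencode(params, safe=":"))  # safe=":" keeps "EPSG:25833" and "fis:s_schulen" unescaped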
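
The rewritten parse() walks the GML FeatureCollection that this request returns: each gml:featureMember wraps one fis:s_schulen feature, the nested geom element is expanded into two coordinate fields, and every other child becomes a dict key once its {namespace} prefix is stripped. Here is a self-contained sketch of that loop against a hand-made sample document; the attribute names fis:bsn and fis:schulname are illustrative assumptions, not a verified excerpt of the real payload:

import xml.etree.ElementTree as ET

# Hypothetical stand-in for one feature of the real WFS response.
SAMPLE = """<wfs:FeatureCollection xmlns:wfs="http://www.opengis.net/wfs"
    xmlns:gml="http://www.opengis.net/gml" xmlns:fis="http://www.berlin.de/broker">
  <gml:featureMember>
    <fis:s_schulen>
      <fis:geom>
        <gml:Point><gml:pos>391000.0 5820000.0</gml:pos></gml:Point>
      </fis:geom>
      <fis:bsn>01A01</fis:bsn>
      <fis:schulname>Beispielschule</fis:schulname>
    </fis:s_schulen>
  </gml:featureMember>
</wfs:FeatureCollection>"""

namespaces = {"gml": "http://www.opengis.net/gml", "fis": "http://www.berlin.de/broker"}
for school in ET.fromstring(SAMPLE).findall("gml:featureMember", namespaces):
    data_elem = {}
    for entry in school[0]:  # school[0] is the fis:s_schulen feature element
        if entry.tag == "{http://www.berlin.de/broker}geom":
            # expand the nested point geometry into two coordinate fields
            lat, lon = entry.findtext("gml:Point/gml:pos", namespaces=namespaces).split(" ")
            data_elem["lat"], data_elem["lon"] = lat, lon
            continue
        # "{http://www.berlin.de/broker}bsn" -> "bsn"
        data_elem[entry.tag.split("}", 1)[1]] = entry.text
    print(data_elem)
# {'lat': '391000.0', 'lon': '5820000.0', 'bsn': '01A01', 'schulname': 'Beispielschule'}

normalize() then maps the layer's German attribute names (bsn, strasse, hausnr, plz, telefon, ...) onto the shared School item, which is why the English keys of the old detail-page scraper disappear from the mapping.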
