Skip to content

Commit 0505b58

Browse files
committed
fix lintcode crawler
1 parent dc8decd commit 0505b58

File tree

3 files changed

+48
-27
lines changed

3 files changed

+48
-27
lines changed

scripts/lintcode.py

+36 -19
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
# -*- coding: utf-8 -*-
33

44

5-
from pyquery import PyQuery as pq
5+
# from pyquery import PyQuery as pq
6+
import requests
67

78

89
class Lintcode(object):
@@ -11,36 +12,52 @@ def __init__(self):
1112
self.driver = None
1213

1314
def open_url(self, url):
14-
self.url = url
1515
print('open URL: {}'.format(url))
16-
self.driver = pq(url=url)
16+
url = url.strip('description')
17+
url = url.strip('/')
18+
self.url = url
19+
lintcode_unique_name = url.split('/')[-1]
20+
req_url = 'https://www.lintcode.com/api/problems/detail/?unique_name_or_alias={}&_format=detail'.format(lintcode_unique_name)
21+
self.driver = requests.get(req_url).json()
1722

1823
def get_title(self):
1924
print('get title...')
20-
title = self.driver('title').text()
25+
title = self.driver['title']
2126
return title
2227

2328
def get_description(self):
2429
print('get description...')
25-
desc_pq = self.driver('#description')
26-
desc_html = desc_pq('.m-t-lg:nth-child(1)').html()
27-
example_html = desc_pq('.m-t-lg:nth-child(2)').html()
28-
return desc_html + example_html
30+
desc = self.driver['description']
31+
notice = self.driver['notice']
32+
clarification = self.driver['clarification']
33+
example = self.driver['example']
34+
challenge = self.driver['challenge']
35+
desc_full = desc
36+
if notice:
37+
desc_full += '\n\n#### Notice\n\n' + notice
38+
if clarification:
39+
desc_full += '\n\n#### Clarification\n\n' + clarification
40+
if example:
41+
desc_full += '\n\n#### Example\n\n' + example
42+
if challenge:
43+
desc_full += '\n\n#### Challenge\n\n' + challenge
44+
45+
return desc_full
2946

3047
def get_difficulty(self):
3148
print('get difficulty...')
32-
progress_bar = self.driver('.progress-bar')
33-
original_title = progress_bar.attr('data-original-title')
34-
splits = original_title.strip().split(' ')
35-
difficulty = splits[1]
36-
ac_rate = splits[-1]
49+
mapping = {1: 'Easy', 2: 'Medium', 3: 'Hard'}
50+
difficulty = mapping.get(self.driver['level'], 'unknown')
3751
return difficulty
3852

3953
def get_tags(self):
4054
print('get tags...')
4155
tags = []
42-
for i in self.driver('#tags.tags a'):
43-
tags.append(i.text)
56+
for i in self.driver['tags']:
57+
if i['alias']:
58+
tags.append(i['alias'])
59+
else:
60+
tags.append(i['name'])
4461
return tags
4562

4663
def _get_related(self):
@@ -67,12 +84,12 @@ def get_problem_all(self, url):
6784
'difficulty': difficulty,
6885
'tags': tags,
6986
'description': description,
70-
'url': self._clean_url(url)
87+
'url': self.url
7188
}
7289
return problem
7390

7491

7592
if __name__ == '__main__':
76-
url = 'http://www.lintcode.com/en/problem/palindrome-number/'
77-
leetcode = Lintcode()
78-
print(leetcode.get_problem_all(url))
93+
url = 'https://www.lintcode.com/problem/topological-sorting'
94+
lintcode = Lintcode()
95+
print(lintcode.get_problem_all(url))

scripts/main.py

+5 -3
Original file line number | Diff line number | Diff line change
@@ -40,14 +40,16 @@ def curr_time():
4040
problem_md = ''
4141
problem_slug = ''
4242
xxxcode = None
43+
convert_desc = True
4344
if raw_url.startswith('https://leetcode'):
4445
xxxcode = Leetcode()
45-
elif raw_url.startswith('http://www.lintcode.com'):
46+
elif raw_url.startswith('https://www.lintcode.com'):
4647
xxxcode = Lintcode()
48+
convert_desc = False
4749
problem = xxxcode.get_problem_all(raw_url)
4850
problem_slug = slugify(problem['title'], separator="_")
49-
problem_md = problem2md(problem)
50-
51+
problem_md = problem2md(problem, convert_desc)
52+
5153
if args.dir:
5254
post_dir = os.path.join(ROOTDIR, args.dir)
5355
post_fn = os.path.join(post_dir, problem_slug + '.md')

scripts/ojhtml2markdown.py

+7 -5
Original file line number | Diff line number | Diff line change
@@ -26,21 +26,21 @@ def content(self):
2626
def leet_lint_url(url):
2727
problem_slug = url.strip('/').split('/')[-1]
2828
leetcode_url = 'https://leetcode.com/problems/{}/'.format(problem_slug)
29-
lintcode_url = 'http://www.lintcode.com/en/problem/{}/'.format(problem_slug)
29+
lintcode_url = 'https://www.lintcode.com/problem/{}/'.format(problem_slug)
3030
urls = {}
3131
for url in [leetcode_url, lintcode_url]:
3232
response = requests.head(url)
3333
if response.status_code != 404:
3434
if url.startswith('https://leetcode'):
3535
urls['leetcode'] = url
36-
elif url.startswith('http://www.lintcode'):
36+
elif url.startswith('https://www.lintcode'):
3737
urls['lintcode'] = url
3838
else:
3939
print('cannot find url with: {}'.format(url))
4040
return urls
4141

4242

43-
def problem2md(problem):
43+
def problem2md(problem, convert_desc=True):
4444
metadata = {
4545
'title': problem['title'],
4646
'difficulty': problem['difficulty']
@@ -49,8 +49,10 @@ def problem2md(problem):
4949
metadata['tags'] = problem['tags']
5050

5151
description = problem['description']
52-
h = html2text.HTML2Text()
53-
description_md = h.handle(description)
52+
description_md = description
53+
if convert_desc:
54+
h = html2text.HTML2Text()
55+
description_md = h.handle(description)
5456

5557
lines = []
5658
lines.append('# ' + problem['title'] + '\n')

0 commit comments

Comments (0)