diff --git a/.gitignore b/.gitignore
index b32d8aa..ec6a0de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,17 @@
 *.pyc
-.idea
\ No newline at end of file
+.idea
+*.ipynb
+test
+ipynb
+etlpy.egg-info
+dist
+data
+EGG-INFO
+.vscode
+.ipynb_checkpoints
+.DS_Store
+__pycache__/
+etlpy/__pycache__/
+etlpy/pinhole.log
+insurance.json
+pinhole.log
diff --git a/README.md b/README.md
index e3cc213..f522f26 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,31 @@
-# etlpy
-##designed by desert
-a smart stream-like crawler & etl python library
-##1.简介
-etlpy是基于配置文件的数据采集和清洗工具。
-写爬虫和数据清洗代码总是很烦人。因此,应该通过工具生成爬虫和数据清洗的代码! etlpy就是为了解决这个问题而生的。
-通过可视化和图形化设计工具,快速生成爬虫和数据清洗流程,并保存为xml文件,并由etlpy引擎解析它,即可获得最终的数据结果。
-##2.使用
-使用起来非常简单:
-```
-from etl import ETLTool
-tool = ETLTool();
-tool.LoadProject('project.xml', '数据清洗ETL-大众点评');
-datas = tool.RefreshDatas();
-for r in datas:
-    print(r)
-```
-RefreshDatas函数返回的是生成器,通过for循环,即可自动读取所有数据。
-##3.基本原理
-模块分为 生成,过滤,排序,转换,执行四种。
-利用Python的生成器,可以将不同模块组织起来,定义一个流水线,数据(python的字典)会在流水线上被加工和消费。
-图形化工具是用C#开发的,使用了类似Python生成器的Linq技术。其原始思路来自于Lisp的s-表达式。
-##4. 用途
-爬虫,计算,清洗,任何符合一定计算范式的数据,都可以使用它来完成。
+# etlpy: a streaming crawler system written in Python
+
+## Introduction
+
+etlpy is a pure-Python library that implements a streaming DSL (domain-specific language): crawling, file processing, data cleaning and the like can each be written in a single line, and it integrates well with libraries such as pandas.
+
+It closely resembles the Linux bash pipeline, LINQ in C#, and Hawk, by the same author.
+
+The following single line fetches the html of cnblogs list pages 1 through 10:
+```
+from etlpy.etlpy import *
+t = task().p.create(range(1, 11)).cp('p:html').format('http://www.cnblogs.com/p{_}').get()
+# t.to_df() produces a DataFrame
+for data in t:
+    print(data)
+
+```
+It generates the numbers 1 to 10 in column p, copies column p into the html column, formats the html column into a url, sends the web requests, and stores the fetched html back in the html column.
+
+Replace the t above with the statement below, and the auto-detection algorithm analyzes the page structure and generates a parsing script on its own:
+
+`t=task().create().url.set('http://www.cnblogs.com').get().tree().detect()`
+
+etlpy's features:
+
+- supports both Python 2 and Python 3
+- built-in proxies and http get/post requests, with a call style very close to the requests library
+- built-in data-cleaning steps such as regex extraction, html escaping and json conversion, ready for direct output
+- tasks parallelize easily over coroutines, threads, processes, and multiple machines
\ No newline at end of file
diff --git a/batch.sh b/batch.sh
new file mode 100755
index 0000000..9e4fbbf
--- /dev/null
+++ b/batch.sh
@@ -0,0 +1,4 @@
+for((i=0; i<$1; ++i))
+do
+    nohup python src/distributed.py client $2 &
+done
diff --git a/distributed.py b/distributed.py
deleted file mode 100644
index 457bd8f..0000000
--- a/distributed.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import sys;
-from queue import Queue
-from multiprocessing.managers import BaseManager
-import etl;
-import json
-import extends;
-import time;
-authkey= "etlpy".encode('utf-8')
-timeout=1;
-rpc_port=8888
-
-class ETLJob:
-    def __init__(self,project,jobname,config,id):
-        self.project= project;
-        self.jobname=jobname;
-        self.config=config;
-        self.id= id;
-
-class JobResult:
-    def __init__(self,name,count,id):
-        self.name=name;
-        self.count=count;
-        self.id=id;
-
-class Master:
-
-    def __init__(self,project,jobname):
-        # 派发出去的作业队列
-        self.dispatched_job_queue = Queue()
-        # 完成的作业队列
-        self.finished_job_queue = Queue()
-        self.project= project;
-        self.jobname=jobname;
-        self.maxprocess= 10;
-
-    def get_dispatched_job_queue(self):
-        return self.dispatched_job_queue
-
-    def get_finished_job_queue(self):
-        return self.finished_job_queue
-
-    def start(self,skip=0):
-        # 把派发作业队列和完成作业队列注册到网络上
-        BaseManager.register('get_dispatched_job_queue', callable=self.get_dispatched_job_queue)
-        BaseManager.register('get_finished_job_queue', callable=self.get_finished_job_queue)
-
-        # 监听端口和启动服务
-        manager = BaseManager(address=('0.0.0.0', rpc_port), authkey=authkey)
-        manager.start()
-
-        # 使用上面注册的方法获取队列
-        dispatched_jobs = manager.get_dispatched_job_queue()
-        finished_jobs = manager.get_finished_job_queue()
-
-        job_id = 0
-        module= self.project.modules[self.jobname];
-
-        proj=json.loads(json.dumps(etl.convert_dict(self.project,self.project.__defaultdict__), ensure_ascii=False))
-        while True:
-            for task in etl.parallel_map(module):
-                job_id = job_id + 1
-                if job_id1:
-        ip=argv[1];
-    if len(argv)>2:
-        port=int(argv[2]);
-    slave= Slave();
-    slave.start(True,ip,port);
-
diff --git "a/docs/1.0\347\273\274\350\277\260.md" "b/docs/1.0\347\273\274\350\277\260.md"
new file mode 100644
index 0000000..ffd09d4
--- /dev/null
+++ "b/docs/1.0\347\273\274\350\277\260.md"
@@ -0,0 +1,65 @@
+# etlpy: A streaming DSL in Python
+
+## Intro
+
+etlpy is a function library written in Python. Even a single line of code can run a complicated web crawler, process files, or filter data, and it integrates well with pandas and requests. The API is purely chained, so the code stays extremely compact.
+
+The design philosophy comes from:
+- the bash pipeline in Linux
+- LINQ in C#
+- the filter system in jinja2 (a template engine)
+- flink and blink
+- Hawk, by the same author
+
+The following line fetches the html of cnblogs list pages 1 through 10:
+
+```
+from etlpy import *
+t = task().p.create(range(1, 11)).cp('p:html').format('http://www.cnblogs.com/p{}').get()
+
+for data in t:
+    print(data)
+
+```
+
+It generates the numbers 1 to 10 in column p, copies column p into the html column, formats the html column into a url as shown, sends a web request to each url, and stores the fetched html in the html column.
+
+Finally, you read all the data from t through its iterator.
+
+etlpy supports:
+- Python 2 & 3
+- http proxies and get/post requests, with a call style very close to the famous requests library
+- regex extraction, filtering, html escaping/cleaning, and json conversion
+- running code in parallel without modifying it
diff --git "a/docs/1.1\345\277\253\351\200\237\345\205\245\351\227\250.md" "b/docs/1.1\345\277\253\351\200\237\345\205\245\351\227\250.md"
new file mode 100644
index 0000000..351692a
--- /dev/null
+++ "b/docs/1.1\345\277\253\351\200\237\345\205\245\351\227\250.md"
@@ -0,0 +1,158 @@
+# Quick start
+
+We again use crawling dianping.com as the running example and walk through etlpy end to end. This example covers:
+
+- the core ideas behind etlpy
+- sniffing structured data out of a web page automatically
+- setting cookies to pose as a browser
+- saving results to a file
+- setting proxies to dodge anti-crawling measures
+
+First install etlpy:
+`pip install etlpy`
+The full example code is at:
+>
+
+Notes:
+
+1. dianping.com does not block access to list pages, but excessive access to detail pages will get you banned.
+
+## 1. The simplest example
+
+Fetch the html of a search page first:
+```
+from etlpy.etlpy import *
+url = 'http://www.dianping.com/'
+url_s = '/search/category/3/75/g2878'
+t = task().create().url.set(url + url_s).get()
+for data in t:
+    print(data)
+```
+
+etlpy creates a default project, inside which you create tasks; the argument of the task function names the task. Every subsequent call is an operator (op), and ops fall into these kinds:
+- generators
+- transformers
+- filters
+- executors
+
+create builds an empty table with one row and zero columns. `.url` targets the url column (created on the fly, since it does not exist yet), set assigns it the value `url + url_s`, and get fetches the html of that url. etlpy's request layer closely mirrors the requests library, so post works too.
+
+> How does get know which url to fetch when it takes no argument?
+Once you target the url column, every following op works on that column until some op moves the target elsewhere. The get op overwrites the url column with the html it fetched.
+> Why must create follow task?
+Every task needs a generator as its starting point; create accepts dict arrays, pandas objects and so on, and builds a data table from them.
+
+The result is an iterator of dicts; to print only the page html:
+`print(data['url'])`
+
+If you want to keep the url column and put the fetched html in a separate column:
+
+`t = task().create().url.set(url + url_s).cp('_:html').get()`
+
+This copies the url column into the html column, then runs get on the html column. cp behaves like the Linux cp command; after the copy, the column pointer moves to the html column.
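+
+To build intuition, here is a rough plain-Python sketch of what such a chain desugars to: each op wraps the previous iterator of dicts. It is illustrative only (the real operator internals differ), and it uses `requests` in place of etlpy's HTTP layer:
+
+```
+import requests
+
+def create():                    # generator op: emit a single empty row
+    yield {}
+
+def set_col(rows, col, value):   # transformer op: write a constant into a column
+    for row in rows:
+        row[col] = value
+        yield row
+
+def get_op(rows, col='url'):     # transformer op: overwrite the column with the fetched html
+    for row in rows:
+        row[col] = requests.get(row[col]).text
+        yield row
+
+stream = get_op(set_col(create(), 'url', 'http://www.dianping.com/'))
+for row in stream:
+    print(row['url'][:100])      # first 100 characters of the html
+```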
+## 2. Sniffing list data from a page
+
+Now for one of etlpy's strongest features: auto-sniffing. We want the list entries of this list page, so append the detect op:
+
+`t = task().create().url.set(url + url_s).get().detect()`
+
+It prints something like:
+
+```
+.xpath('/html/body/div/div[4]/div[6]/div').list().html().tree()\
+.cp('url:diggnum').xpath('//div[1]/div[1]/span')[0].text()\
+.cp('url:titlelnk').xpath('//div[2]/h3/a')[0].text()\
+.cp('url:post_item_summary').xpath('//div[2]/p')[0].text()\
+.cp('url:lightblue').xpath('//div[2]/div/a')[0].text()\
+.cp('url:gray').xpath('//div[2]/div/span[1]/a')[0].text()\
+.cp('url:col5').xpath('//div[2]/div/span[2]/a')[0].text()\
+#diggnum : #1
+#titlelnk : #NodeJs之数据库异常处理
+#post_item_summary : #
+#lightblue : #leslie·Zhao
+#gray : #评论(0)
+#col5 : #阅读(70)
+```
+Once you have some experience you can write this by hand. Before explaining how it works, paste it verbatim onto the end of the earlier stream, remembering to delete detect:
+```
+t = task().create().url.set(url + url_s).get()\
+.xpath('/html/body/div/div[4]/div[6]/div').list().html().tree()\
+.cp('url:diggnum').xpath('//div[1]/div[1]/span')[0].text()\
+.cp('url:titlelnk').xpath('//div[2]/h3/a')[0].text()\
+.cp('url:post_item_summary').xpath('//div[2]/p')[0].text()\
+.cp('url:lightblue').xpath('//div[2]/div/a')[0].text()\
+.cp('url:gray').xpath('//div[2]/div/span[1]/a')[0].text()\
+.cp('url:col5').xpath('//div[2]/div/span[2]/a')[0].text()
+```
+Run it again and the whole list page comes back. The result looks like this.
+
+First, the xpath op: it parses the html and extracts the nodes matching the xpath. Because it may return several items (more than one node can satisfy the expression), the list op expands that inner list into the outer table (see note 1).
+
+Each resulting item is an etree node; the html op converts it to html, which is then turned back into a tree node.
+
+The key line is this one; the text op extracts the text of an etree node:
+
+cp('url:diggnum').xpath('//div[1]/div[1]/span')[0].text()
+
+Since there are several attribute columns, you chain one such expression per attribute to collect them all.
+
+If you want a node's html instead, swap the text op for html.
+
+The new column names are inferred automatically; edit the part after the `:` in cp to set your own.
+
+## 3. Setting cookies
+
+get/post requests need many other parameters (cookies, hosts, ...), and you may even rotate proxies constantly. Several ops may share the same parameters while differing in small details, so how to pass parameters into etlpy's ops becomes an interesting question.
+
+We express the parameters as a tree. You build an expression tree, and only at execution time does etlpy evaluate its leaf nodes and hand the values to the op. The example says it best:
+
+```
+params = '''Accept-Encoding:gzip, deflate
+Host:www.dianping.com
+User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'''
+headers = para_to_dict(params, '\n', ':')
+r = request_param
+r = r.merge('headers', headers)
+```
+request_param is the system's default parameter; converting the string into a dict and merging it in as headers builds the tree.
+Then pass r directly as the parameter of the get/post op.
+
+What about proxies?
+According to the requests documentation, requests sets a proxy like this:
+```
+r = requests.get(url, proxies={'http': 'http://ip:port'})
+```
+So the following implements a dynamic proxy module:
+-- code to be added
+
+A random evaluator fetches a proxy dynamically on each execution, spreading the calls evenly across the proxies.
+
+## 4. How to page through results?
+
+## 5. Fetching all data for all categories
+
+## 6. Writing data out
+
+We can save the generated data to a file, or export it to a pandas DataFrame with .to_df(); writing json is also possible.
+
+But when the data volume is very large and distribution is not an option
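+
+A sketch of the export step described in this section, continuing the variables from section 1 (it assumes, per the README, that `to_df()` materializes the stream as a pandas DataFrame; the json branch is plain standard-library code and the file names are made up):
+
+```
+import json
+
+t = task().create().url.set(url + url_s).get()
+
+df = t.to_df()                         # materialize the stream as a DataFrame
+df.to_csv('dianping.csv', index=False)
+
+# alternatively, stream the dicts straight into a json file
+with open('dianping.json', 'w') as f:
+    json.dump(list(t), f, ensure_ascii=False)
+```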
+## 7. Parallel execution
+
+Crawling all of dianping.com with the script above would be very slow, so we need parallelism.
+
+Using parallelism in etlpy is absurdly easy. Set mode=PROCESS_MODE (multi-process mode) on the query or execute call, and it analyzes the operator sequence and the shape of its output, then inserts parallel operators automatically.
+
+If you are unhappy with the result, you can add pl ops by hand, or force an op's execution mode (how many workers, how many records per batch, threads vs. processes). The execution engine then slices the stream into segments and builds a hierarchical map-reduce workflow out of them.
+
+Parallel mode involves plenty of details and can cause surprising problems; the parallelization chapter covers it in depth.
diff --git "a/docs/1.2\346\240\270\345\277\203\346\246\202\345\277\265.md" "b/docs/1.2\346\240\270\345\277\203\346\246\202\345\277\265.md"
new file mode 100644
index 0000000..84de0fc
--- /dev/null
+++ "b/docs/1.2\346\240\270\345\277\203\346\246\202\345\277\265.md"
@@ -0,0 +1,146 @@
+# Core concepts
+
+Before using etlpy, meet its core concepts:
+
+## Data streams (generator)
+
+etlpy is built for batch data processing; we call the batch a data stream. Typical streams are:
+
+- arrays (whose elements may be dicts or objects)
+- lines of text
+- Excel sheets and pandas DataFrames
+- 1000 web requests
+
+What they share: the elements resemble one another and can be visited through an iterator. To simplify the discussion, what we process is an iterator of dicts. Picture a workflow that keeps producing, transforming, and finally emitting dicts.
+
+## Properties (property)
+
+In a dict such as `{'a':22,'b':33}`, a and b are its keys. With many dicts that all carry a and b, you get a table in which a and b are columns, so `property` and `column` are effectively one concept.
+
+Real data processing usually means consecutive operations on one particular property. When let or another op targets certain columns (you may set several), etlpy remembers those names and applies all following ops to them until the target is reset. The targeted columns are called the target columns; this keeps the code short.
+
+## Operators (tool)
+
+Operators (ops) modify dicts, and fall into five kinds:
+
+- generators (GE): e.g. produce 100 dicts with key p and values '1' to '100'
+- transformers (TF): e.g. extract the digits in the 'address' column into the 'phone' column
+- filters (FT): e.g. drop every dict whose value in some column is empty
+- executors (EX): e.g. store all the dicts into MongoDB
+- sorters (ST): e.g. order the stream by column a, descending
+
+Ops are comparable to primitive operations like addition and multiplication. etlpy ships convenient ops for file I/O, web access and more, and you can extend it with your own tools.
+
+Most ops take one main parameter that applies to the target column; a few take extra optional parameters. For example:
+
+```
+task().create(datas).a.split(',')  # split's parameter is the separator, applied to column a
+```
+
+The parameter can also be read from another column; the code above is equivalent to (see the `expressions chapter`):
+```
+task().create(datas).p.set(',').a.split('[p]')  # the bracket expression reads the value from another column
+```
+> Op design strives for orthogonality: one op, one job. An op normally acts on the target column with a default main parameter (a few act on the whole stream or the whole record).
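+
+An illustrative chain that touches each kind, as a feel for how they compose (an untested sketch: the op names are the ones documented in this manual, the data and file name are invented):
+
+```
+t = (task()
+     .p.create(range(1, 101))   # generator: 100 rows {'p': 1}..{'p': 100}
+     .format('id-{_}')          # transformer: p becomes 'id-1', 'id-2', ...
+     .where('"3" in _')         # filter: keep rows whose p contains a '3'
+     .descend(lambda x: x)      # sorter: order by p (materializes the stream)
+     .write('ids.txt'))         # executor: one value per line, fires in execute mode
+for r in t:
+    print(r)
+```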
+## Tasks (task)
+
+Every task is defined with task(); composing ops defines the complete task.
+
+A task usually starts with a generator:
+```
+generator = task('task_name').let('p').create(range(1,20))
+
+for r in generator:
+    print(r)
+
+# equivalent to:
+def generator():
+    for i in range(1,20):
+        yield {'p': i}
+for r in generator():
+    print(r)
+
+```
+The argument of task() is the task's name; every task needs a unique name, which defaults to 'task'.
+
+Think of tasks as functions: you can define several, a parent task may call child tasks, and a task may call itself. Tasks can be split apart, or serialized to json and similar formats for transmission over the network.
+
+> To keep them short, later examples may omit the leading `task()`.
+
+Creating a task does not run it; tasks are lazy. etlpy offers clean ways to invoke them:
+
+First create a task: `t = task()....  # later ops omitted`
+
+Iterate to visit each generated element in turn:
+```
+for item in t:
+    print(item)
+```
+
+Or call it directly, with the number of leading elements to take, producing a dict array:
+
+`item_list = t(100)`
+
+A task can be exported to json or yaml, and imported back from json:
+
+```
+js = t.to_json()
+t2 = task().load_json(js)
+```
+
+A task can also be built from a string, e.g.:
+
+`t = task().eval("p.range(1,10).format('{_}{_}')")`
+
+Tasks can be added directly: `t1 += t2` is equivalent to `t1.subge(t2, mode='+')`, i.e. the tasks are concatenated.
+
+They can be multiplied, which is an element-wise Cartesian product: `t1 *= t2`
+
+And or-ed, which splices the two tables horizontally: `t1 |= t2`
+
+## Parameters (param)
+
+Ops may need parameters. Passing many of them hurts readability, and when many ops use similar parameters it gets even harder to control:
+
+`t=task().get(url='123',cookie='xxx')...get(url='234',cookie='xxx',data=None)`
+
+So we repackage op parameters as a dict, which makes passing them easy:
+```
+p = Param(url='123', cookie='xxx')
+t = task().get(p)...get(p)
+```
+
+A param is a tree-structured dict: the keys are parameter names, the values are str/int/lambda or another param. Only when a param is evaluated (p.eval()) does it recursively compute all its values and emit a plain POCO dict.
+
+etlpy ships several default param types that cover most needs: picking a random value from an array (useful for proxy rotation), lambda expressions, or dynamic evaluation against the current data stream (the `[]` expression).
+
+The most interesting part: you can merge new entries into a param to derive a new param. In the figure below, b is a plus a cookie. This adapts to arbitrarily messy op-parameter needs.
+
+TODO: figure needed
+
+### Projects (project)
+
+A project maintains several tasks, which may depend on one another. It holds the default environment (env) plus all the tasks and params.
+
+A project is the smallest unit of offline storage; it too can be saved as json, yaml etc. for network transfer, and deserializes back into an equivalent project.
+
+`from etlpy import *` creates the default project.
diff --git "a/docs/1.3\345\207\240\347\247\215\350\241\250\350\276\276\345\274\217.md" "b/docs/1.3\345\207\240\347\247\215\350\241\250\350\276\276\345\274\217.md"
new file mode 100644
index 0000000..32cc07c
--- /dev/null
+++ "b/docs/1.3\345\207\240\347\247\215\350\241\250\350\276\276\345\274\217.md"
@@ -0,0 +1,76 @@
+# Expressions
+
+## Brackets
+
+An op's parameter can be read from another column: with brackets, the value of another column of the current dict becomes the op's parameter.
+
+```
+task().create(datas).p.set(',').a.split('[p]')  # the bracket expression reads the separator from column p
+```
+
+> Nested expressions such as `[p][p2]` are not supported yet.
+> Brackets work for the main parameter of any op.
+> The most common use is generating ranges of numbers.
+> Bracket syntax also works inside the param system.
+
+## Ranges
+
+A range expression, for example:
+
+- `1` just 1
+- `1:10` the positive integers from 1 to 10
+- `1:10:2` 1,3,5,...,9
+- `[p]:[q]:[m]` like `1:10:2`, but the values are read from other columns
+
+Range expressions work in ops such as create, range, and notin.
+
+## Formatting
+
+Used by the format op, identical to Python's standard format; see the relevant documentation.
+
+## Column mappings
+
+Used by mv, cp, cp2, keep, let and so on:
+
+- `p` column p
+- `p q r` the three columns p, q, r, space-separated
+- `_ q r` where `_` means the previous op's target column
+- `p:q r i:j` p maps to q, r stays, i maps to j
+- `[p] q r` the first column name is read from column p
+
+To make the following ops target the whole dict, there are two ways:
+
+- `...let('')` pass an empty argument
+- `...html()._...`
+
+The attribute spelling `_` is equivalent to let('_').
+
+> See also the usage of map, where, and sort.
+
+## Function expressions
+
+map and where accept a function given as a string. Suppose the data is the dict `{'p':10,'q':30}`.
+
+Two cases:
+
+- applied to the target column (whose value is written `_`)
+
+`p.map('_>5')`
+
+- applied to the whole record (the dict is written `_`, and each property can be referenced directly, e.g. p, q)
+
+`_.map('p>5 and q<3')`
+
+Not every column name is a valid Python identifier (consider `p['12']`), so the indexer form works too; the code above is equivalent to:
+
+`_.map("_['p']>5 and _['q']<3")`
+
+> `_` means the previous op's target column/data
diff --git "a/docs/2.0\347\256\227\345\255\220.md" "b/docs/2.0\347\256\227\345\255\220.md"
new file mode 100644
index 0000000..c17197c
--- /dev/null
+++ "b/docs/2.0\347\256\227\345\255\220.md"
@@ -0,0 +1,85 @@
+# Operators
+
+## 1. Classification
+
+Operators process properties. By scope they come in three kinds:
+
+- col: acts on one particular property, e.g. strip the whitespace of a given property
+- data: acts on the whole dict, e.g. remove every property whose name starts with a
+- stream: acts on the data stream, e.g. take only the first three records, or skip two
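+
+In plain-Python terms, over a list of dicts the three scopes look roughly like this (illustrative only):
+
+```
+import itertools
+
+rows = [{'a': ' x ', 'ab': 1}, {'a': 'y', 'ab': 2}, {'a': 'z', 'ab': 3}]
+
+# col: touch one property per row (strip whitespace in column a)
+col_scope = (dict(r, a=r['a'].strip()) for r in rows)
+
+# data: touch whole rows (drop every property whose name starts with 'a')
+data_scope = ({k: v for k, v in r.items() if not k.startswith('a')} for r in rows)
+
+# stream: touch the sequence itself (keep only the first two rows)
+stream_scope = itertools.islice(rows, 2)
+
+print(list(col_scope), list(data_scope), list(stream_scope))
+```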
+## 2. Operator reference
+
+The table below summarizes how the operators are used:
+
+| name | parameter | function | result | example | type |
+| -------- | ------ | -------- | ------ |-------- | ------ |
+| | | **setting the target column** | | | |
+| let | 'new' | set the target column to new | new | let('new') | TF-data |
+| mv | 'old:new' | move the source column to a new column | new | mv('a:b') | TF-data |
+| cp | 'old:new' | copy the source column to a new column | new | cp('a:b') | TF-data |
+| cp2 | 'old:new' | copy the source column to a new column | old | cp2('a:b') | TF-data |
+| rm | 'a' | delete column a | unchanged | rm('a b') | TF-data |
+| keep | 'a' | keep only column a | a | keep('a') | TF-data |
+| | | **transformers** | | | |
+| incr | none | add an auto-increment key | unchanged | incr() | TF-col |
+| split | separator | split the target column | [] | split(',') | TF-col |
+| into | 'a b' | spread an array over several columns | none | into('a b') | TF-data |
+| at | index | take one item of an array/dict | none | at('a') | TF-data |
+| escape | html/url | html/url-escape | str | escape('html') | TF-col |
+| clean | html/url | html/url-unescape | str | clean('url') | TF-col |
+| format | merge expression | merge several columns | str | format('{0}-{col}') | TF-col |
+| regex | regular expression | extract the matched content | [str] | regex('\d+') | TF-col |
+| replace | regular expression | replace the matched value | str | replace('\d+',value='new') | TF-col |
+| num | none | extract the numbers | [float] | num() | TF-col |
+| strip | char | strip leading/trailing characters | str | strip() | TF-col |
+| extract | start string | extract between two markers | str | extract('start',end='end') | TF-col |
+| map | func/lambda | apply to every element | ? | map(print) | TF-col/data |
+| dump | 'json/yaml/xml' | dump the target column to text | str | dump('json') | TF-col |
+| load | 'json/yaml/xml' | load from text | obj | load('json') | TF-col |
+| todict | 'a b' | merge columns a, b into a dict | obj | todict('a b') | TF-data |
+| drill | 'a:b' | lift the dict in the target column outward | none | drill('a:b') | TF-data |
+| | | **crawler tools** | | | |
+| html | none | get a node's html | str | html() | TF-col |
+| text | none | get a node's text (or str) | str | text() | TF-col |
+| xpath | xpath-str | extract from html by xpath | [node] | xpath('//div') | TF-col |
+| pyq | jquery-str | extract from html by jquery selector | [node] | pyq('.a') | TF-col |
+| detect | int | auto-sniff page lists | table + xpath | detect(0) | TF-data |
+| cache | empty array | cache the results so far | none | cache([]) | TF-data |
+| get | data to send | http get request | none | get('[p]') | TF-col |
+| post | data to send | http post request | none | post('[p]') | TF-col |
+| tree | html | build an etree from html | none | tree() | TF-col |
+| search | keyword | search the html for an xpath | prints the xpath | search('aa') | TF-col |
+| dl | target path | download the target column's link to a path | none | dl('abc.zip') | TF-col |
+| | | **stream processing** | | | |
+| take | count p | take the first p elements | none | take(5) | TF-stream |
+| list | columns to lift | drill an array column up into the stream | new stream | list('a b c') | TF-stream |
+| skip | count p | skip the first p elements | none | skip(5) | TF-stream |
+| last | no parameter | take the final element | none | last() | TF-stream |
+| count | count p | print the index every p elements | unchanged | count(5) | TF-stream |
+| tag | comment | a comment, nothing more | unchanged | tag('note') | TF-stream |
+| agg | func/lambda | aggregate neighboring records | unchanged | agg(lambda a,b: a+b) | TF-stream |
+| delay | count p | sleep for p milliseconds | unchanged | delay(100) | TF-stream |
+| pl | parallel mode | insert a parallel operator | none | pl(5) | TF-stream |
+| | | **generators** | | | |
+| create | generator/array | generate data | unchanged | create([{}]) | GE-data/col |
+| range | range expression string | generate numbers over an interval | unchanged | range('1:100:2') | GE-col |
+| read | file path | read a file line by line | unchanged | read('a.txt') | GE-col |
+| | | **filters** | | | |
+| where | func/lambda | filter the data | unchanged | where(lambda x:x!=None) | FT-col |
+| null | no parameter | drop empty objects | unchanged | null() | FT-col |
+| repeat | no parameter | drop duplicate records | unchanged | repeat() | FT-col |
+| match | regex | drop non-matching records | unchanged | match('\d') | FT-col |
+| | | **executors** | | | |
+| dbex | connector name | write the data into a connector | unchanged | dbex('connector') | EX-data |
+| write | file path | write the data to a file, one record per line | unchanged | write('abc.txt') | EX-col |
+| | | **sorters** | | | |
+| ascend | func/lambda | sort ascending | unchanged | ascend(lambda x:x[0]) | ST-data |
+| descend | func/lambda | sort descending | unchanged | same as above | ST-col |
+| | | **subtasks** | | | |
+| subge | subtask name or instance | call as a generator | unchanged | subge(task().create()..) | GE-stream |
+| subex | subtask name or instance | call as an executor | unchanged | subex('task_name') | EX-stream |
+| sub | subtask name or instance | call as a transformer | unchanged | sub('task_name') | TF-stream |
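+
+A few of these ops chained together, to show how the table reads in practice (an untested sketch: every op appears above, the data is invented):
+
+```
+from etlpy.etlpy import *
+
+t = (task()
+     .a.create(['1,2,3', '40,5,6'])  # GE: two rows in column a
+     .split(',')                     # TF-col: a -> ['1','2','3']
+     .into('x y z')                  # TF-data: spread the list over x, y, z
+     .rm('a')                        # TF-data: drop the original column
+     .x.num()[0]                     # TF-col: pull the number out of x
+     .where('_ > 2'))                # FT-col: keep rows whose x exceeds 2
+for row in t:
+    print(row)                       # expect something like {'x': 40.0, 'y': '5', 'z': '6'}
+```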
diff --git "a/docs/2.1\347\224\237\346\210\220\345\231\250.md" "b/docs/2.1\347\224\237\346\210\220\345\231\250.md"
new file mode 100644
index 0000000..ff9074a
--- /dev/null
+++ "b/docs/2.1\347\224\237\346\210\220\345\231\250.md"
@@ -0,0 +1,58 @@
+# Generators
+
+Generators produce data, and a stream must start with one. A generator inserted in the middle of a stream has to be fused with the existing stream; the fusion is controlled by the default mode parameter:
+
+- cross(*): Cartesian product
+- append(+): vertical concatenation (the default)
+- merge(|): horizontal concatenation
+- mix (mix): interleave, ABABAB
+
+The common generators:
+
+## create
+
+### Without a target column
+
+The parameter must be a dict array or a dict generator, e.g.:
+```
+task().create(({'a': i} for i in range(10)))
+{'a':0}
+{'a':1}
+{'a':2}
+...
+```
+The parameter can also be a pandas DataFrame, producing the corresponding dicts.
+
+### With a target column
+
+With a target column set, the parameter is an array or generator of elements of any type; the code above is equivalent to:
+`task().a.create(range(10))`
+
+Read a file and put its lines into column l:
+`task().l.create(open('file').readlines())`
+
+The parameter can also be a number: the count of empty dicts to generate.
+
+`task().create(5).p.set(10)`
+
+- outputs five dicts, each equal to {'p': 10}
+
+## range
+
+range creates numbers over an interval; its parameter takes three forms:
+
+- a tuple or list of 1 to 3 elements, e.g. range((1,10,2))
+- a string such as '1:10', '10', '1:10:1'; the code above is also equivalent to `task().a.range('1:10:2')`
+- a single number, e.g. range(10)
+
+Why support the string form? Because the maximum or the step may come from other columns:
+
+`task().create().p.create([20]).a.range('1:[p]', mode='*')`
+
+This first creates column p with value 20, then fills column a with the range from 1 to 20 (p's value), combined in mode * (cross).
+
+> Why range((1,10)) instead of range(1,10), which would be more Pythonic? Because range has optional parameters: had it been defined as
+
+`def range(start=0, end=1, interval=2, mode='*')`
+
+there would be no way to tell whether the 10 in range(10) is start or end.
diff --git "a/docs/2.2\350\275\254\346\215\242\345\231\250.md" "b/docs/2.2\350\275\254\346\215\242\345\231\250.md"
new file mode 100644
index 0000000..d185b63
--- /dev/null
+++ "b/docs/2.2\350\275\254\346\215\242\345\231\250.md"
@@ -0,0 +1,197 @@
+# Transformers
+
+There are many transformers, so only the ones that deserve special attention are explained here.
+
+## let and set
+
+let does not modify a column; it designates the target column. Through let you choose the columns to process, e.g.:
+
+`task().create(datas).let('a').strip().format('{}{}')`
+
+strips the whitespace of property a and repeats its value twice. Several columns can be processed in one batch, space-separated, as in `a b c`.
+
+- let('*') processes all columns
+- the parameter may be a regular expression; every matching column is processed
+
+If there is only one column and its name follows Python's naming rules, the let call can be dropped; the code above shortens to:
+
+`task().create(datas).a.strip().format('{}{}').b....`
+
+The set op assigns a fixed value to a column:
+
+`p.set('value')`
+
+set can also fill the target column from another column; the above is equivalent to:
+
+`a.set('value').p.set('[a]')`
+
+## Target-column transformers
+
+See the operator table for cp, cp2, mv, rm, keep. `keep('a')` deletes every column except a.
+
+To batch-process several columns, separate the names with spaces: `mv('a b c')`
+
+cp and cp2 accept several key:value pairs; to copy column a to b and column c to d:
+`cp('a:b c:d')`
+
+To keep column a and move c to d, two spellings work:
+`keep('a c').mv('c:d')` or:
+`keep('a c:d')`
+
+> Mind the difference between cp and cp2: cp slides the target after copying, cp2 does not. cp2 is equivalent to cp('a:b').let('a'). The names mv, cp, rm follow the Linux style.
+
+## at
+
+Indexes into the target column's data and puts the result back in place. Usage matches Python's dict and array indexing, and at itself can be omitted:
+
+`create(datas).p.at(0).at('hello')`
+
+is equivalent to:
+
+`create(datas).p[0]['hello']`
+
+at also supports slices:
+
+`create(datas).p[2:10]`
+
+Many ops produce arrays, dicts or generators; at pulls the data out of any of them. For a generator, it yields the n-th element.
+
+> For generators, be careful with reverse indexes like `-1`: they may force the generator to materialize, which can hurt performance.
+
+## String splitting
+
+ETL work is full of string splitting, hence the split and into ops (plus at).
+
+split turns a string into an array. If column a holds '1 2 3' and you want the parts in columns a, b, c:
+
+`a.split(' ').cp('a:b')[1].cp('a:c')[2].a[0]`
+
+which is also equivalent to:
+
+`a.split(' ').into('a b c')`
+
+## Escaping and loading strings
+
+escape escapes a string; clean unescapes it. Web pages are full of entities such as nbsp, which are really spaces:
+
+`a.clean('html')`
+
+uses the html unescaper; to url-encode instead:
+
+`a.escape('url')`
+
+Similarly there are load and dump, e.g. load the json text in column a and convert it to xml:
+
+`a.load('json').dump('xml')`
+
+load/dump currently support the xml, json, and yaml types.
+
+## Regular expressions
+
+Regexes extract content from text conveniently; etlpy supports matching and filtering over the stream with the re and num ops.
+
+re outputs its matches as an array; e.g. with a = '123 345 678', to pull the strings out:
+
+`a.re('\d+')[0]  # column a now holds 123`
+
+num is a special case of re for extracting floats and integers.
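+
+What re and num wrap is ordinary Python regex machinery; roughly:
+
+```
+import re
+
+value = '123 345 678'
+matches = re.findall(r'\d+', value)   # the re op stores this list in the column
+print(matches[0])                     # [0] then picks '123'
+print([float(m) for m in matches])    # num() additionally casts the matches to numbers
+```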
+## Drilling up and down
+
+### list
+
+When the target column holds a dict array (or dict generator), a common need is to lift it out to replace the outer stream. list is the one-to-many op for that, for example:
+
+`task().p.create(({'a': i} for i in range(10))).list()`
+
+which is equivalent to:
+`task().a.create(range(10))`
+
+Note that after the one-to-many step the original stream's columns are dropped. To keep them, pass a column expression to list: `list('p:p1 p2')` stores column p in the new stream's p1 and column p2 in the new stream's p2. The syntax matches cp and keep.
+
+### drill and todict
+
+Conversely, when the target column is a dict (a dict, not a dict array) that should be merged outward, use drill:
+
+`p.set({'a':1,'b':2}).drill()` is equivalent to:
+
+`a.set(1).b.set(2)`
+
+Without a parameter, drill converts all the keys; it also accepts a parameter, with the same syntax as list.
+
+To merge several columns into the target column as a dict, use todict:
+
+`a.set(1).b.set(2).p.todict('a b')`
+
+todict is often used to post dict data to a website, among other things.
+
+## String handling
+
+strip behaves like Python's standard strip.
+
+extract pulls a substring out by its surrounding markers, which is ideal for fishing data out of messy text. If column p holds 'abc 123 def' and you want the 123:
+
+`p.extract('abc', end='def')`
+
+The default parameter is the start string, end is the closing string, and the extracted content is whatever lies between the two.
+
+format merges several columns into one, with the same usage as Python's standard format; e.g. turn column p into a url:
+
+`p.set(1).format('www.abc.com/p{}')`  # outputs www.abc.com/p1
+
+Several columns can be merged:
+
+`p.set(1).q.set(2).format('{[p]}:{[q]}')`  # outputs 1:2
+
+## Stream handling
+
+take and skip trim the stream, e.g.:
+
+`p.create(range(1,10)).skip(3).take(2)`
+
+outputs: {'p':4}, {'p':5}
+
+last takes the stream's final element:
+
+`p.create(range(1,10)).last()`
+
+outputs: {'p':9}
+
+## Debugging and output
+
+See the debugging and output chapter for details; any of these tags can be inserted anywhere in the stream code:
+
+the tag tag, which exists purely to annotate what the code does:
+
+`p.tag('a comment goes here').list()...`
+
+the count tag, which reports the position after every p elements:
+
+`p.range(100).count(20)`
+
+outputs: 20, 40, 60, ...
diff --git "a/docs/2.3\347\210\254\350\231\253\350\275\254\346\215\242\345\231\250.md" "b/docs/2.3\347\210\254\350\231\253\350\275\254\346\215\242\345\231\250.md"
new file mode 100644
index 0000000..f10b18a
--- /dev/null
+++ "b/docs/2.3\347\210\254\350\231\253\350\275\254\346\215\242\345\231\250.md"
@@ -0,0 +1,95 @@
+# Crawler transformers
+
+etlpy is tuned for crawling and provides a family of ops for it.
+
+## Fetching pages: get, post
+
+`url.set('www.cnblogs.com').cp('_:content').get()`
+
+stores the url in the url column, fetches the page, and saves the fetched body into the content column.
+
+To send a post request, put the post data as a dict in some column and pass it as the parameter of the post op:
+
+`p.set({'a':1,'b':2}).url.set('www.baidu.com').post('[p]')`
+
+> This uses the bracket expression.
+
+For proxies, see the proxy section at the bottom of this page.
+
+## tree, xpath, pyq
+
+get and post return raw html text; the tree op loads it into lxml-based nodes.
+
+After that, xpath and pyquery can pull data out of the html nodes.
+
+For example: `url.set('www.cnblogs.com').get().tree().xpath('//div[2]')[0]`
+
+The xpath and pyq ops take an xpath expression and a jquery expression respectively; the result is **the array of nodes satisfying the expression**.
+
+tree can be omitted above: xpath and pyq inspect the target column's value and, if it is a string, parse it into a tree automatically. Calling tree up front serves repeated extraction from the same html, e.g.:
+
+`url.set('www.cnblogs.com').get().tree().cp('_:d1').xpath('//div[2]').cp('url:d2').xpath('//p[2]')`
+
+Here the html is parsed only once (the parsed node stays in the url column) instead of repeatedly. Parsing is expensive, so call tree explicitly where you can.
+
+> See the relevant references for xpath and jquery syntax.
+
+> If the tree op's target column is already a node type, no conversion happens.
+
+## html and text
+
+On a node you can call the html and text ops to get its html or its text. Thus html() is equivalent to dump('html') and tree() to load('html'); they are inverses of each other.
+
+> When the input column is some other type, text applies str() to it.
+
+A common need is extracting the main body of a page (news articles and the like); tree accepts a smart parameter and automatically returns the node holding the best candidate body, which you can then turn into html or text:
+
+`...html.tree(smart=True).html()`
+
+## Auto-sniffing and search
+
+Writing xpath by hand is still tedious, so etlpy ships a smart data detector. After the tree op you can apply the detect op, which automatically prints the best list data it can find together with the matching etlpy code. For example:
+
+`url.set('http://www.cnblogs.com').get().tree().detect()`
+
+You can also run search after the tree op, with a keyword as the parameter; it prints the corresponding xpath.
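+
+Since the docs say the nodes are lxml etree nodes, tree/xpath/text/html correspond roughly to this direct lxml usage (illustrative only):
+
+```
+import requests
+from lxml import etree
+
+body = requests.get('http://www.cnblogs.com').text
+tree = etree.HTML(body)                     # tree(): parse the html into a node
+nodes = tree.xpath('//div[2]')              # xpath(): the matching node list
+if nodes:
+    print(nodes[0].xpath('string()')[:80])  # text(): the node's text content
+    print(etree.tostring(nodes[0])[:80])    # html(): the node's markup
+```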
+## Downloading files
+
+The dl op downloads files. The target column holds the link; the op's parameter is the path to save to, e.g.:
+
+`path.set('save_path').url.set('url').dl('[path]')`
+
+TODO: to add other cookies, ...
+
+## Using proxies
+
+Proxies keep a crawler from being blocked; no serious crawler should go without them.
+
+A stream may use several get/post ops against the same site, and configuring a proxy on every single get or post gets very tedious. Instead, set a proxy manager in the project environment (env); it intercepts the execution of the get/post/dl ops and injects the proxy logic for them.
+
+In the default project, the following call creates a proxy named my_proxy:
+
+```
+def set_proxy(name='my_proxy', proxy=None, header=None, delay=0.1, agent=True, timeout=20, allow_local=True):
+    pass
+```
+The proxy parameter accepts an array of strings as the list:
+
+proxy = ['http://10.244.0.8:2398']
+
+From then on, every web-request op whose proxy is set to my_proxy is intercepted by that manager automatically:
+
+`task()...get(proxy='my_proxy')....post(proxy='my_proxy')`
+
+When the proxy array is non-empty, the system picks a proxy from the list at random for each request.
+
+TODO: figure out how to pass several named parameters
diff --git "a/docs/2.4\350\277\207\346\273\244\345\222\214\346\216\222\345\272\217\345\231\250.md" "b/docs/2.4\350\277\207\346\273\244\345\222\214\346\216\222\345\272\217\345\231\250.md"
new file mode 100644
index 0000000..85abbcc
--- /dev/null
+++ "b/docs/2.4\350\277\207\346\273\244\345\222\214\346\216\222\345\272\217\345\231\250.md"
@@ -0,0 +1,71 @@
+# Filters and sorters
+
+## Filters and where
+
+A filter keeps every row matching its condition; the rows that fail it are dropped.
+
+> Analogously, the rm op (a transformer) drops matching columns.
+
+where is the most common filter, and works like map:
+
+`task().p.range(10).where(lambda x: x>5)`
+
+equivalent to:
+
+`task().p.range(10).where('_>5')`  # _ is the value of column p
+
+To invert the result (keep what fails the condition), use the revert parameter; the code above is also equivalent to:
+
+`p.range(10).where('_<=5', revert=True)`
+
+To test the whole record instead of one particular column:
+
+`...let('').where('p>5 and q<3')`  # the empty target means the whole dict
+
+> The string/macro parameters behave exactly as in map.
+
+## Filtering empty objects and strings
+
+notnull drops empty elements and empty strings.
+
+match tests whether the current column's string matches. The default is plain string matching; for a regular expression:
+
+`...match(regex='\d+')`
+
+## Filtering out some elements
+
+> notin
+
+notin keeps only the elements that are **not in the given array**; the parameter may be an array or a tuple.
+
+`notin([1,2,3])`
+
+notin also accepts a string parameter, using range-expression syntax:
+
+`notin('1:100')`
+
+## Sorters
+
+ascend and descend sort the stream. Note that a sorter breaks the stream's continuity (all the data must be collected before sorting, so there is a performance cost).
+
+Usage is exactly as for map and where.
+
+A simple example:
+
+`task().p.range(10).descend(lambda x: x)`
+
+or, passing a callable directly:
+
+`task().p.range(10).descend(abs)`
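+
+In plain Python, where and the sorters boil down to filtering and sorted(); note how sorting forces the whole stream into memory, which is exactly the performance caveat above:
+
+```
+rows = [{'p': i} for i in range(10)]
+
+kept = [r for r in rows if r['p'] > 5]                      # where(lambda x: x > 5)
+ordered = sorted(rows, key=lambda r: r['p'], reverse=True)  # descend(...)
+
+print(kept[0], ordered[0])   # {'p': 6} {'p': 9}
+```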
+## An entertaining aside
+
+This syntax would actually have been possible:
+
+`t=task().p.range[2:10:2].notin[2:10]`
+
+So why not use it? Because range is a function, not a property: written like this, there is no way to tell whether range is an op or the name of some column. For uniformity's sake the idea was dropped.
\ No newline at end of file
diff --git "a/docs/2.5\346\211\247\350\241\214\345\231\250\345\222\214\346\225\260\346\215\256\345\272\223.md" "b/docs/2.5\346\211\247\350\241\214\345\231\250\345\222\214\346\225\260\346\215\256\345\272\223.md"
new file mode 100644
index 0000000..661129a
--- /dev/null
+++ "b/docs/2.5\346\211\247\350\241\214\345\231\250\345\222\214\346\225\260\346\215\256\345\272\223.md"
@@ -0,0 +1,8 @@
+# Executors
+
+An executor is a special transformer that only takes effect in execution mode. etlpy distinguishes debug mode from run mode explicitly: executors are usually side-effecting operations (such as hitting the web), so they fire only when actually executing.
diff --git "a/docs/2.6\350\207\252\345\256\232\344\271\211\347\256\227\345\255\220.md" "b/docs/2.6\350\207\252\345\256\232\344\271\211\347\256\227\345\255\220.md"
new file mode 100644
index 0000000..245f661
--- /dev/null
+++ "b/docs/2.6\350\207\252\345\256\232\344\271\211\347\256\227\345\255\220.md"
@@ -0,0 +1,50 @@
+# Custom operators
+
+map and where cover the vast majority of needs, but when some operator is especially useful to your business you can define your own.
+
+## A new generator
+
+Inherit from Generator; this one generates the interval from p to q:
+
+```
+class MRange(Generator):
+    def __init__(self):
+        super(MRange, self).__init__()
+        self.p = 1   # every op has a default parameter p
+        self.q = 10  # set the defaults
+    def generate(self, data, column):  # override this function
+        # data carries the record the generator sits on, used in cross mode; ignore it here
+        for i in range(self.p, self.q):
+            yield {column: i}
+```
+Call it with:
+
+`task().p.mrange(1, q=20)`
+
+## A new transformer
+
+Inherit from Transformer; the pattern matches the generator.
+
+The function to override depends on the transformer's kind:
+
+| kind | function to override | constructor flags |
+| -- | -- | -- |
+| col | `def transform(self, data):` | `self.one_input = True` |
+| data | `def transform(self, data, col, ncol):` | `self.one_input = False` |
+| stream | `def m_process(self, data, column):` | `self._m_process = False` |
+
+## A new filter
+
+Inherit from Filter; the pattern matches the generator.
+
+Whatever the kind, override `def filter(self, data)`; the difference:
+
+| kind | meaning of data in filter | constructor flags |
+| -- | -- | -- |
+| col | filters on the target column | `self.one_input = True` |
+| data | filters on the record dict | `self.one_input = False` |
+
+> A filter over the whole stream? None comes to mind yet.
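+
+A sketch of a custom col-scope filter along the lines described above (the class name, and the assumption that it is then invoked like the built-ins, are illustrative and untested against the source):
+
+```
+class EvenFT(Filter):
+    def __init__(self):
+        super(EvenFT, self).__init__()
+        self.one_input = True      # col kind: filter() receives the target column's value
+
+    def filter(self, data):
+        return int(data) % 2 == 0  # keep rows whose target column is even
+
+# intended usage, mirroring the built-ins: task().p.create(range(10)).evenft()
+```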
diff --git "a/docs/2.7\345\255\220\344\273\273\345\212\241.md" "b/docs/2.7\345\255\220\344\273\273\345\212\241.md"
new file mode 100644
index 0000000..3bda14b
--- /dev/null
+++ "b/docs/2.7\345\255\220\344\273\273\345\212\241.md"
@@ -0,0 +1,31 @@
+## Subtasks
+
+As flows grow longer and more complex they become hard to manage, so modular design is the saner choice.
+
+A subtask is essentially a function: it has input parameters and outputs and treats a whole piece of functionality as a unit, which makes reuse easy. You can design several tasks and compose them, and subtasks can act as transformers, filters, executors, or generators. Notably, a subflow may itself call further subflows, forming a tree-shaped call structure.
+
+Once created, a subtask is invoked by name or by instance, in three forms:
+
+| form | function | note |
+| -- | -- | -- |
+| generator | subge
+
+TODO: use map for the generator form
+
+# The environment
+
+etlpy needs some environment to run; the environment travels along the operators and is passed into subtasks.
+
+The column name is its most typical entry:
+
+`task().create(datas).p.subge(sub_task)`
+
+means sub_task starts with p as its target column.
+
+If the subtask calls further subtasks, the environment keeps flowing down until something resets it.
+
+TODO: what if subtf=map? let map carry an operator. Executors feel vestigial here and could be dissolved.
+
+debug and run modes.
\ No newline at end of file
diff --git "a/docs/2.8 \345\217\202\346\225\260\347\263\273\347\273\237.md" "b/docs/2.8 \345\217\202\346\225\260\347\263\273\347\273\237.md"
new file mode 100644
index 0000000..a52b6ad
--- /dev/null
+++ "b/docs/2.8 \345\217\202\346\225\260\347\263\273\347\273\237.md"
@@ -0,0 +1,37 @@
+# The param system
+
+The param system exists to pass parameters to ops conveniently and tersely. At heart it is an expression tree that can be composed and spliced at will; evaluating it yields a plain POCO dict.
+
+## Why
+
+Ops may need parameters. Passing many of them hurts readability, and when many ops use similar parameters it gets even harder to control:
+
+`t=task().get(url='123',cookie='xxx')...get(url='234',cookie='xxx',data=None)`
+
+So op parameters are repackaged as a dict, and passing them becomes easy:
+```
+p = Param(url='123', cookie='xxx')
+t = task().get(p)...get(p)
+```
+
+## The built-in param types
+
+### The plainest Param
+
+```
+p = Param(url='abc', cookie='xxx')
+# same as
+p = Param({'url': 'abc', 'cookie': 'xxx'})
+
+execute: p.eval()
+{'url': 'abc', 'cookie': 'xxx'}
+```
+
+A param can also serve as a value inside a parent param.
+
+> Warning: rules must not invoke themselves, otherwise evaluation recurses forever.
+
+### expression param
+
+expression param
\ No newline at end of file
diff --git "a/docs/3.\345\271\266\350\241\214\345\214\226.md" "b/docs/3.\345\271\266\350\241\214\345\214\226.md"
new file mode 100644
index 0000000..24e4853
--- /dev/null
+++ "b/docs/3.\345\271\266\350\241\214\345\214\226.md"
@@ -0,0 +1,4 @@
+
+# Parallelization
+
diff --git "a/docs/4.\346\227\245\345\277\227\345\222\214\350\260\203\350\257\225.md" "b/docs/4.\346\227\245\345\277\227\345\222\214\350\260\203\350\257\225.md"
new file mode 100644
index 0000000..9bbad9e
--- /dev/null
+++ "b/docs/4.\346\227\245\345\277\227\345\222\214\350\260\203\350\257\225.md"
@@ -0,0 +1,37 @@
+# Logging and debugging
+
+## Logging
+
+etlpy logs through Python's standard logging module, so you can redirect its output to any destination and format, and set the output level, as you see fit.
+
+Sane logging matters. You can set different log levels:
+
+- >4: print the full stack on exceptions
+- >5: report the full processing history of every record
+- >2:
+
+## Step-wise debugging
+
+The functional style is hard to step through in a debugger; etlpy offers several ways around this.
+
+### Insert debugging ops mid-stream
+
+At any point in the stream, these ops can be inserted to expose intermediate results:
+
+insert a count op to see how far the output has come;
+
+insert a tag op to document the design intent at that spot;
+
+insert `map(print)` or `map(your_defined_function)` to print or store intermediate results.
+
+### Disabling some operators
+
+Every op accepts an optional enabled parameter, true by default; set it to False to skip the op.
+
+### Step by step
+
+Because etlpy forms one call chain, a common need is to mute everything after some point and execute only a prefix of the chain
\ No newline at end of file
diff --git "a/docs/5.pandas\351\233\206\346\210\220.md" "b/docs/5.pandas\351\233\206\346\210\220.md"
new file mode 100644
index 0000000..88193ab
--- /dev/null
+++ "b/docs/5.pandas\351\233\206\346\210\220.md"
@@ -0,0 +1,9 @@
+# pandas integration
+
+You may have noticed that etlpy offers almost none of the reduce operations, such as group-by aggregation. The reasons:
+
+1. they break the stream's continuity (later steps can only run once all the data has arrived, which is terrible for performance)
+2. pandas already has an API for this that is plenty powerful and pleasant
+
+Integrating with pandas is exceptionally easy:
+
diff --git "a/docs/6.\344\273\243\347\240\201\350\257\264\346\230\216\345\222\214\345\216\237\347\220\206.md" "b/docs/6.\344\273\243\347\240\201\350\257\264\346\230\216\345\222\214\345\216\237\347\220\206.md"
new file mode 100644
index 0000000..36eb68b
--- /dev/null
+++ "b/docs/6.\344\273\243\347\240\201\350\257\264\346\230\216\345\222\214\345\216\237\347\220\206.md"
@@ -0,0 +1,37 @@
+# Code analysis and internals
+
+etlpy's source is very compact: just three .py files, with the core etl.py at only 1500 lines.
+
+## The core
+
+etlpy's interpreter analyzes the chained syntax and builds the corresponding computation graph, but evaluation happens only when the result is actually iterated.
+
+The generate function splices multiple tools together into one higher-order function; its implementation:
+```
+def generate(tools, generator=None, env=None):
+    '''
+    evaluate a tool stream
+    :param tools: [ETLTool]
+    :param generator: seed generator
+    :param env: compile-time environment (currently just the column name)
+    :return: a generator
+    '''
+    if env is None:
+        env = {'column': ''}
+    if tools is not None:
+        for tool in tools:
+            env = get_column(tool, env)
+            if isinstance(tool, LetTF):
+                continue
+            column = env['column']
+            generator = tool.process(generator, column)  # the core step
+            env['column'] = env['next']
+    if generator is None:
+        return []
+    return generator
+```
+
+It walks the tools, and at the core step each one wraps the iterator anew. The environment is passed along and mutated while the chain is being built (currently it only tracks the column name).
+
diff --git "a/docs/7.\350\241\245\345\205\205\350\257\264\346\230\216.md" "b/docs/7.\350\241\245\345\205\205\350\257\264\346\230\216.md"
new file mode 100644
index 0000000..0de5a4e
--- /dev/null
+++ "b/docs/7.\350\241\245\345\205\205\350\257\264\346\230\216.md"
@@ -0,0 +1,4 @@
+
+# Additional notes
+
+## Why
diff --git "a/docs/8.\345\244\207\345\277\230\345\275\225.md" "b/docs/8.\345\244\207\345\277\230\345\275\225.md"
new file mode 100644
index 0000000..c7ad14d
--- /dev/null
+++ "b/docs/8.\345\244\207\345\277\230\345\275\225.md"
@@ -0,0 +1,6 @@
+> Why build everything from operators?
+- the process can be saved, as an intermediate file readable by other languages/platforms
+- chains like these could be generated automatically from the page structure, e.g. by an RNN
+
+Drawbacks:
+operators this orthogonal obscure the business logic; what used to be one step may need several ops combined
\ No newline at end of file
diff --git a/docs/bug-fix.md b/docs/bug-fix.md
new file mode 100644
index 0000000..b85f954
--- /dev/null
+++ b/docs/bug-fix.md
@@ -0,0 +1,219 @@
+
+## ok
+generator: range(1,1,1) returns the default value  ok
+
+added the mix method  ok
+
+
+def cross(a, gene_func):
+    for r1 in a:
+        r1 = dict.copy(r1);
+        for r2 in gene_func(r1):
+            for key in r2:
+                r1[key] = r2[key]
+            yield dict.copy(r1);
+
+Changing the indentation moved the yield one level out; that cost more than half an hour of debugging...
+
+## 2016-10-08
+
+added RotateTF  ok
+
+new lianjia crawler
+
+### To solve
+
+The parallel executor at the subflow level? No parallelization inside subflows  OK
+
+###
+
+Checking whether something is a string -- how long it took for that thing to surface!!! merge_query!!!
+
+## Generators  ok
+
+If a generator returns a dict array, every later operation mutates those dicts and the run cannot be repeated; obviously they must be copied.
+
+Once the engine changes, the amount of code to modify is huge... so it has to stay stable.
+
+## Multithreading  ok!
+
+How to do multithreading, multi-machine, parallel, multiprocessing, and so on? With design-once-run-many this cannot be known in advance; it has to be decided dynamically at run time.
+
+Dynamic decision is supported, and so is spelling it out in code; it can also be forced through external configuration.
+
+Dynamic decision may have bugs; consider removing it later.
+
+So pl is still called pl. query also takes the execution mode and thread count, which makes for quite a few parameters; it only carries the logic of whether data flows back.
+
+Threads and processes can be unified, and the thread count made configurable. Pass a configuration environment in.
+
+Multiprocessing cannot transfer generators, but it can transfer seeds.
+
+## Multi-machine distribution  ok!
+
+## Anti-anti-crawling
+
+Proxy + delay; parallelism is off in that case. Keep a proxy pool and advance one entry per access.
+
+A global accessor. Delay management.
+
+Where do the IPs come from? Buy them... pass an array in  ok
+
+
+
+
+
+## Environment passing  ok
+
+The operations chain, but there is a clear split between compile time and run time. How do compile-time parameters reach run time?
+
+Add an environment for things only known at compile time, such as column. Similarly, enabled and execute could go in -- all implicit, of course.
+
+Last question: is the environment self.env? Ideally yes, otherwise it gets annoying.
+
+The drawback? Concurrent compilation is unlikely, but concurrent execution will happen -- and compilation is over by then.
+
+Conclusion: not self.env, but a temporary env variable.
+
+
+## Keep going or not?  ok
+
+Keep going! Sunk cost, yes, but too much effort has gone in to abandon it now.
+LINQ taken to the extreme... must the entire pipeline really be designed this way?
+
+## Add caching  ok!
+
+Otherwise every run is painfully slow.
+
+- either a global cache
+- or a crawler cache -- the crawler cache is simpler and saner
+
+Consider writing to a file, keyed by url and parameters. Apart from web access nothing is that expensive; add more if needed.
+
+## UI design
+
+- build it myself
+auto-completion, parameter hints, inline help, syntax highlighting, web-based, WYSIWYG
+- anaconda
+needs caching added and the experience is mediocre, but everyone knows it.
+Use anaconda for now.
+
+## The enabled problem
+
+Don't be so user-hostile: if something isn't needed, delete it, no excuses.
+
+Split the stream apart and reassemble it; no need to make the flow that long. Streaming languages have design patterns too.
+
+Debugging can be inserted at any position, truncating the stream after it, so no more commenting things in and out  ok
+## Anti-crawling logic
+
+Needs a proper mechanism to configure the environment a crawler wants  mostly ok
+
+Also, must a streaming language be crawler-oriented at all?
+
+format feels awkward -- must it be {}? never mind
+
+## Dropping unnecessary dependencies  ok
+
+Install a library only when it is actually needed:
+flask
+ipy_notebook
+ipython
+pyquery
+simplejson: if it is only for dump, why not the built-in json?
+pandas:
+
+    def process_req(self, args):
+        if self.delay != 0:
+            time.sleep(self.delay)
+        headers = self.headers
+        if headers not in ({}, ''):
+            if is_str(headers):
+                headers = para_to_dict(headers, '\n', ',')
+            if self.agent:
+                headers['User-Agent'] = random.choice(USER_AGENTS)
+            args['headers'] = headers
+        if self.proxy is not None and len(self.proxy) > 0:
+            l = len(self.proxy) - 1
+            if self.allow_local == True:
+                l += 1
+            index = random.randint(0, l)
+            if index < len(self.proxy):
+                proxy = self.proxy[index]
+                args['proxies'] = {'http': proxy}
+
+
+    def proxy(port=8000):
+        from http_proxy import LoggingProxyHTTPHandler
+        import BaseHTTPServer
+        server_address = ('', port)
+        print('start proxy')
+        httpd = BaseHTTPServer.HTTPServer(server_address, LoggingProxyHTTPHandler)
+
+        httpd.serve_forever()
+
+
+
+# Avoid regenerating generators over and over  ok
+
+# 2017-10-17
+
+improve subflows  ok
+need a local parallel persistence scheme
+
+## Retry on error
+
+There is no retry-on-error yet; failures should be recorded
+
+on understanding execution
+
+
+# 2017-10-31
+
+list().html().tree() is especially clumsy; can it be simplified away?
+
+not as a prefix of where, to negate the condition.
+
+
+## Promotion plan for etlpy
+
+1. a banner on Hawk's github page  xxx
+2. cnblogs
+3. ATA
+4. the WeChat account
+
+Two posts: one purely technical, one on the design philosophy.
+
+Priority: low; collect feedback; finishing in December is fine.
+Estimated github stars: 500; users: 10K
+Docs: about 80% done, Chinese and English in parallel (get help with the translation)
+
+Translating etlpy into pandas idiom would speed it up massively
+solve logic problems with where clauses
+
+extend could append the needed script behind -- a stream of streams
+
+feature-engineering operators: onehot, cross combinations, selection logic
+
+generate sql
+
+tensor_provider: post it on ATA, many will read it
+
+optimize join
+
+throw exceptions everywhere and see whether the program survives; good programs must handle exceptions
+
+design a web UI for watching task progress
+
+rangetf... the cross one: really just stack a list behind it, plus a drill if it is a dict. Downside: even harder to understand. Also, a generator can by default be followed by a PL
+
diff --git a/etl.py b/etl.py
deleted file mode 100644
index 47e21f8..0000000
--- a/etl.py
+++ /dev/null
@@ -1,975 +0,0 @@
-# coding=utf-8
-__author__ = 'zhaoyiming'
-import re;
-import extends
-import urllib
-import spider;
-import json;
-import html
-import xml.etree.ElementTree as ET
-import csv
-
-import os;
-
-intattrs = re.compile('Max|Min|Count|Index|Interval|Position');
-boolre = re.compile('^(One|Can|Is)|Enable|Should|Have|Revert');
-rescript = re.compile('Regex|Number')
-
-
-def SetAttr(etl, key, value):
-    if key in ['Group','Type']:
-        return
-
-    if intattrs.search(key) is not None:
-        try:
-            t = int(value);
-            setattr(etl, key, t);
-        except ValueError:
-            print('it is a ValueError')
-            setattr(etl, key, value);
-    elif boolre.search(key) is not None:
-        setattr(etl, key, True if value == 'True' else False);
-    else:
-        setattr(etl, key, value);
-
-def getMatchCount(mat):
-    return mat.lastindex if mat.lastindex is not None else 1;
-
-class ETLTool(extends.EObject):
-    def __init__(self):
-        self.Enabled=True;
-        self.Column = ''
-    def process(self, data):
-        return data
-    def init(self):
-        pass;
-
-class Transformer(ETLTool):
-    def __init__(self):
-        super(Transformer, self).__init__()
-        self.IsMultiYield=False
-        self.NewColumn='';
-        self.OneOutput=True;
-        self.OneInput = False;
-
-    def transform(self,data):
-        pass;
-    def process(self,data):
-        if self.IsMultiYield: # one to many
-            for r in data:
-                for p in self.transform( r):
-                    yield extends.MergeQuery(p, r,self.NewColumn);
-            return;
-        for d in data: # one to one
-            if self.OneOutput:
-                if self.Column not in d or self.Column not in d:
-                    yield d;
-                    continue;
-                item = d[self.Column] if self.OneInput else d;
-                res = self.transform(item)
-                key= self.NewColumn if self.NewColumn!='' else self.Column;
-                d[key]=res;
-            else:
-                self.transform( d)
-            yield d;
-
-class Executor(ETLTool):
-    def execute(self,data):
-        pass;
-    def process(self,data):
-        for r in data:
-            self.execute(r);
-            yield r;
-
-
-class Filter(ETLTool):
-    def __init__(self):
-        super(Filter, self).__init__()
-        self.Revert=False;
-    def filter(self,data):
-
-        return True;
-
-    def process(self, data):
-        for r in data:
-            item = None;
-            if self.Column in r:
-                item = r[self.Column];
-            if item is None and self.__class__ != NullFT:
-                continue;
-            result = self.filter( item)
-            if result == True and self.Revert == False:
-                yield r;
-            elif result == False and self.Revert == True:
-                yield r;
-
-class Generator(ETLTool):
-    def __init__(self):
-        super(Generator, self).__init__()
-        self.MergeType='Append'
-        self.Position=0;
-    def generate(self,generator):
-        pass;
-
-    def process(self, generator):
-        if generator is None:
-            return self.generate(None);
-        else:
-            if self.MergeType=='Append':
-                return extends.Append(generator,self.process(None));
-            elif self.MergeType=='Merge':
-                return extends.Merge(generator, self.process(None));
-            else:
-                return extends.Cross(generator,self.generate)
-
-
-
-class ConnectorBase(ETLTool):
-    def __init__(self):
-        super(ConnectorBase, self).__init__()
-        self.Connector = '';
-        self.ExecuteType = 'OnlyInsert'
-        self.filetype = '';
-
-    def init(self):
-        self.connector= self.__proj__.connectors[self.Connector];
-        if self.connector.TypeName=='MongoDBConnector':
-            import pymongo
-            client = pymongo.MongoClient(self.connector.ConnectString);
-            db = client[self.connector.DBName];
-            self.Table = db[self.TableName];
-        else:
-            path = self.TableName;
-            filetype = path.split('.')[-1].lower();
-            encode = 'utf-8';
self.file = open(path, type, encoding=encode) - self.filetype = filetype; - - -class DbEX(ConnectorBase): - def __init__(self): - super(DbEX, self).__init__() - self.TableName='' - - - - - def process(self,datas): - if self.connector.TypeName == 'MongoDBConnector': - etype = self.ExecuteType; - table = self.Table; - work = {'OnlyInsert': lambda d: table.save(d),'InsertOrUpdate':lambda d: table.save(d)}; - for data in datas: - work[etype](data); - yield data; - else: - - if self.filetype in ['csv', 'txt']: - field = extends.getkeys(datas); - self.writer = csv.DictWriter(self.file, field, delimiter=sp, lineterminator='\n') - self.writer.writeheader() - for data in datas: - self.writer.writerow(data); - yield data; - elif self.filetype == 'json': - self.file.write('[') - for data in datas: - json.dump(data, self.file, ensure_ascii=False) - self.file.write(','); - yield data; - self.file.write(']') - self.file.close(); - - -class DBGE(ConnectorBase): - - def generate(self,data): - if self.Connector=='MongoDBConnector': - for data in self.Table.find(): - yield data; - else: - if self.filetype in ['csv', 'txt']: - sp = ',' if self.filetype == 'csv' else '\t'; - reader = csv.DictReader(self.file, delimiter=sp) - for r in reader: - yield r; - elif self.filetype == 'json': - items = json.load(self.file); - for r in items: - yield r; - - def process(self, generator): - if generator is None: - return self.generate(None); - else: - if self.MergeType == 'Append': - return extends.Append(generator, self.process(None)); - elif self.MergeType == 'Merge': - return extends.Merge(generator, self.process(None)); - else: - return extends.Cross(generator, self.generate) - - -def setValue(data,etl,value): - if etl.NewColumn!='': - data[etl.NewColumn]=value; - else: - data[etl.Column]=value; - -class RegexFT(Filter): - - def init(self): - self.Regex = re.compile(self.Script); - self.Count=1; - - def filter(self,data): - v = self.Regex.findall(data); - if v is None: - return False; - else: - return self.Count <= len(v) - -class RangeFT(Filter): - - def filter(self,item): - f = float(item) - return self.Min <= f <= self.Max; - -class RepeatFT(Filter): - - def init(self): - self.set=set(); - def filter(self,data): - if data in self.set: - return False; - else: - self.set.add(data); - return True; - -class NullFT(Filter): - - def filter(self,data): - if data is None: - return False; - if isinstance(data, str): - return data.strip() != ''; - return True; - - -class AddNewTF(Transformer): - - def transform(self,data): - return self.NewValue; - - -class AutoIndexTF(Transformer): - def init(self): - super(AutoIndexTF, self).__init__() - self.currindex = 0; - def transform(self, data): - self.currindex += 1; - return self.currindex; - - -class RenameTF(Transformer): - - def __init__(self): - super(RenameTF, self).__init__() - self.OneOutput = False; - def transform(self, data): - if not self.Column in data: - return; - item = data[self.Column]; - del data[self.Column]; - if self.NewColumn != "": - data[self.NewColumn] = item; - -class DeleteTF(Transformer): - def __init__(self): - super(DeleteTF, self).__init__() - self.OneOutput = False; - def transform(self, data): - if self.Column in data: - del data[self.Column]; - -class HtmlTF(Transformer): - def __init__(self): - super(HtmlTF, self).__init__() - self.OneInput=True; - - def transform(self, data): - return html.escape(data) if self.ConvertType == 'Encode' else html.unescape(data); - - -class UrlTF(Transformer): - def __init__(self): - super(UrlTF, self).__init__() - 
self.OneInput = True; - def transform(self, data): - if self.ConvertType == 'Encode': - url = data.encode('utf-8'); - return urllib.parse.quote(url); - else: - return urllib.parse.unquote(data); - - -class RegexSplitTF(Transformer): - def transform(self, data): - items = re.split(self.Regex, data) - if len(items) <= self.Index: - return data; - if not self.FromBack: - return items[self.Index]; - else: - index = len(items) - self.Index - 1; - if index < 0: - return data; - else: - return items[index]; - return items[index]; - -class MergeTF(Transformer): - def __init__(self): - super(MergeTF, self).__init__() - self.Format='{0}' - self.MergeWith='' - def transform(self, data): - if self.MergeWith == '': - columns = []; - else: - columns = [str(data[r]) for r in self.MergeWith.split(' ')] - columns.insert(0, data[self.Column] if self.Column in data else ''); - res = self.Format; - for i in range(len(columns)): - res = res.replace('{' + str(i) + '}', str(columns[i])) - return res; - - - - -class RegexTF(Transformer): - def __init__(self): - super(RegexTF, self).__init__() - self.Script = ''; - self.OneInput = True; - - def init(self): - self.Regex = re.compile(self.Script); - def transform(self, data): - item = re.findall(self.Regex, str(data)); - if self.Index < 0: - return ''; - if len(item) <= self.Index: - return ''; - else: - r = item[self.Index]; - return r if isinstance(r, str) else r[0]; - -class ReReplaceTF(RegexTF): - - def transform(self, data): - return re.sub(self.Regex, self.ReplaceText, data); - -class NumberTF(RegexTF): - def __init__(self): - super(NumberTF, self).__init__() - self.Script='' #TODO - - def transform(self, data): - t = super(NumberTF,self).transform( data); - if t is not None and t != '': - return int(t); - return t; - -class SplitTF(Transformer): - def __init__(self): - super(SplitTF, self).__init__() - self.SplitChar=''; - self.OneInput = True; - - - def transform(self, data): - splits = self.SplitChar.split(' '); - sp = splits[0] - if sp == '': - return data; - - r = data.split(splits[0]); - if len(r) > self.Index: - return r[self.Index]; - return ''; - -class TrimTF(Transformer): - def __init__(self): - super(TrimTF, self).__init__() - self.OneInput = True; - - def transform(self, data): - return data.strip(); - -class StrExtractTF(Transformer): - def __init__(self): - super(StrExtractTF, self).__init__() - self.HaveStartEnd=False; - self.Start='' - self.OneInput=True; - self.End='' - - def transform(self, data): - start = data.find(self.Former); - if start == -1: - return - end = data.find(self.End, start); - if end == -1: - return; - if self.HaveStartEnd: - end += len(self.End); - if not self.HaveStartEnd: - start += len(self.Former); - return data[start:end]; - -class PythonTF(Transformer): - def __init__(self): - super(PythonTF, self).__init__() - self.OneOutput=False - self.Script='value' - self.ScriptWorkMode='不进行转换' - def transform(self, data): - result = eval(self.Script, {'value': data[self.Column]}, data); - if result is not None and self.IsMultiYield == False: - key = self.NewColumn if self.NewColumn != '' else self.Column; - data[key] = result; - return result; - -class CrawlerTF(Transformer): - def __init__(self): - super(CrawlerTF, self).__init__() - self.CrawlerSelector=''; - self.MaxTryCount=1; - self.IsRegex=False - self.OneOutput=False; - def init(self): - self.IsMultiYield = True; - self.crawler = self.__proj__.modules.get(self.CrawlerSelector, None); - self.buff = {}; - def transform(self, data): - crawler = self.crawler; - url = 
data[self.Column]; - buff = self.buff; - if url in buff: - datas = buff[url]; - else: - datas = crawler.CrawData(url); - if len(buff) < 100: - buff[url] = datas; - if self.crawler.IsMultiData == 'List': - for d in datas: - res = extends.MergeQuery(d, data, self.NewColumn); - yield res; - else: - data = extends.Merge(data, datas); - yield data; - - -class XPathTF(Transformer): - def __init__(self): - super(XPathTF, self).__init__() - self.XPath='' - self.IsMultiYield = True; - self.OneOutput=False; - - def init(self): - self.IsMultiYield=True; - self.OneOutput = False; - def transform(self, data): - from lxml import etree - if self.IsManyData: - tree = spider.GetHtmlTree(data[self.Column]); - nodes = tree.xpath(self.XPath); - for node in nodes: - ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}; - ext['OHTML'] = ext['HTML'] - yield extends.MergeQuery(ext, data, self.NewColumn); - else: - tree = spider.GetHtmlTree(data[self.Column]); - nodes = tree.xpath(self.XPath); - node=nodes[0] - if hasattr(node,'text'): - setValue(data, self, node.text); - else: - setValue(data,self,str(node)) - yield data; - - -class ToListTF(Transformer): - def transform(self, data): - yield data; - -class JsonTF(Transformer): - def __init__(self): - super(JsonTF, self).__init__() - self.OneOutput=False - self.ScriptWorkMode='文档列表'; - - def init(self): - self.IsMultiYield= self.ScriptWorkMode=='文档列表'; - - def transform(self, data): - js = json.loads(data[self.Column]); - if isinstance(js, list): - for j in js: - yield j; - else: - yield js; - -class RangeGE(Generator): - def __init__(self): - super(RangeGE, self).__init__() - self.Interval='1' - self.MaxValue='1' - self.MinValue='1' - def generate(self,generator): - interval= int(extends.Query(generator,self.Interval)) - maxvalue= int(extends.Query(generator,self.MaxValue)) - minvalue= int(extends.Query(generator,self.MinValue)) - for i in range(minvalue,maxvalue,interval): - item= {self.Column:round(i,5)} - yield item; - -class RangeTF(Transformer): - def __init__(self): - super(RangeTF, self).__init__() - self.Skip=0; - self.Take=9999999; - def transform(self, data): - skip = int(extends.Query(data, self.Skip)); - take = int(extends.Query(data, self.Take)); - i = 0; - for r in data: - if i < skip: - continue; - if i >= take: - break; - i += 1; - yield r; - - -class EtlGE(Generator): - def generate(self,data): - subetl = self.__proj__.modules[self.ETLSelector]; - for r in generate(subetl.AllETLTools): - yield r; - -class EtlEX(Executor): - def execute(self,datas): - subetl = self.__proj__.modules[self.ETLSelector]; - for data in datas: - if spider.IsNone(self.NewColumn): - doc = data.copy(); - else: - doc = {}; - extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column); - result=(r for r in generate(subetl.AllETLTools, [doc])) - count=0; - for r in result: - count+=1; - print(r); - print(count) - yield data; - -class EtlTF(Transformer): - def transform(self,datas): - subetl = self.__proj__.modules[self.ETLSelector]; - if self.IsMultiYield: - - for data in datas: - doc = data.copy(); - for r in subetl.__generate__(subetl.AllETLTools, [doc]): - yield extends.MergeQuery(r, data, self.NewColumn); - else: - yield None; # TODO - - - -class TextGE(Generator): - def __init__(self): - super(TextGE, self).__init__() - self.Content=''; - def init(self): - self.arglists= [r.strip() for r in self.Content.split('\n')]; - def generate(self,data): - for i in range(self.Position, len(self.arglists)): - yield {self.Column: 
self.arglists[i]} - - - - - - -class TableEX(Executor): - def __init__(self): - super(TableEX, self).__init__() - self.Table = 'Table'; - def execute(self,data): - tables= self.__proj__.tables; - tname = self.Table; - if tname not in tables: - tables[tname] = []; - for r in data: - tables[tname].append(r); - yield r; - - - - - - - -class BaiduLocation(Transformer): - pass; - - -class GetIPLocation(Transformer): - pass; - -class GetRoute(Transformer): - pass; - -class NearbySearch(Transformer): - pass; - -class NlpTF(Transformer): - pass; - -class TransTF(Transformer): - pass; -class JoinDBTF(Transformer): - pass; - -class RepeatTF(Transformer): - pass; -class ResponseTF(Transformer): - pass; - -class Time2StrTF(Transformer): - pass; - - -class BfsGE(Generator): - pass; - -class DictTF(Transformer): - pass; - -class FileExistFT(Transformer): - def __init__(self): - super(FileExistFT,self).__init__(); - self.Script = ''; - self.OneInput = True; - def transform(self,data): - import os; - return str(os.path.exists(data)); - -class MergeRepeatTF(Transformer): - pass; - -class NumRangeFT(Filter): - pass; - -class DelayTF(Transformer): - pass; - -class ReadFileTextTF(Transformer): - pass; - -class WriteFileTextTF(Transformer): - pass; -class FolderGE(Generator): - pass; - -class TableGE(Generator): - pass; -class FileDataTF(Transformer): - pass; - - - -class SaveFileEX(Executor): - def __init__(self): - super(SaveFileEX, self).__init__() - self.SavePath=''; - - def execute(self,data): - - save_path = extends.Query(data, self.SavePath); - (folder,file)=os.path.split(save_path); - if not os.path.exists(folder): - os.makedirs(folder); - urllib.request.urlretrieve(data[self.Column], save_path) - - -def GetChildNode(roots, name): - for etool in roots: - if etool.get('Name') == name or etool.tag == name: - return etool; - return None; - - -def InitFromHttpItem(config, item): - httprib = config.attrib; - paras = spider.Para2Dict(httprib['Parameters'], '\n', ':'); - item.Headers = paras; - item.Url = httprib['URL']; - post = 'Postdata'; - if post in httprib: - item.postdata = httprib[post]; - else: - item.postdata = None; - - - - -class Project(extends.EObject): - def __init__(self): - self.modules={}; - self.tables={} - self.connectors={}; - self.__defaultdict__={}; - - -def LoadProject_dict(dic): - proj = Project(); - for key,connector in dic['connectors'].items(): - proj.connectors[key]= extends.dict_to_poco_type(connector); - for key,module in dic['modules'].items(): - task =None; - if 'AllETLTools' in module: - task = etl_factory(ETLTask(),proj); - for r in module['AllETLTools']: - etl= etl_factory(r['Type'],proj); - for attr,value in r.items(): - if attr in ['Type']: - continue; - setattr(etl,attr,value); - etl.__proj__=proj; - task.AllETLTools.append(etl) - elif 'CrawItems' in module: - task=etl_factory(spider.SmartCrawler(),proj); - task.CrawItems=[]; - extends.dict_copy_poco(task,module); - for r in module['CrawItems']: - crawlitem= etl_factory(spider.CrawItem(),proj) - extends.dict_copy_poco(crawlitem,r); - task.CrawItems.append(crawlitem) - task.HttpItem= etl_factory(spider.HTTPItem(),proj) - extends.dict_copy_poco(task.HttpItem,module['HttpItem']) - task.HttpItem.Headers=module['HttpItem']["Headers"]; - if task is not None: - proj.modules[key]=task; - - print('load project success') - return proj; - - -def task_DumpLinq(tools): - array=[]; - for t in tools: - typename= extends.get_type_name(t); - newcolumn=getattr(t,'NewColumn',''); - s='%s,%s'%(typename,t.Column); - s+='=>%s,'%newcolumn if 
newcolumn!='' else ','; - attrs=[]; - defaultdict= t.__proj__.__defaultdict__[typename]; - for att in t.__dict__: - value=t.__dict__[att]; - if att in ['NewColumn','Column','IsMultiYield']: - continue - if not isinstance(value,(str,int,bool,float)): - continue; - if value is None or att not in defaultdict or defaultdict[att]==value: - continue; - attrs.append('%s=%s'%(att,value)); - s+=','.join(attrs) - array.append(s) - return '\n'.join(array); - -def convert_dict(obj,defaultdict): - if not isinstance(obj, (str, int, float, list, dict, tuple, extends.EObject)): - return None -# if isinstance(obj,) - if isinstance(obj, extends.EObject): - d={} - typename = extends.get_type_name(obj); - - for key, value in obj.__dict__.items(): - if typename in defaultdict: - default = defaultdict[typename]; - if value== default.get(key,None): - continue; - if key.startswith('__'): - continue; - - p =convert_dict(value,defaultdict) - if p is not None: - d[key]=p - if isinstance(obj,ETLTool): - d['Type']= typename; - return d; - - elif isinstance(obj, list): - return [convert_dict(r,defaultdict) for r in obj]; - elif isinstance(obj,dict): - return {key: convert_dict(value,defaultdict) for key,value in obj.items()} - return obj; - - - - - return d - -def Project_DumpJson(proj): - dic= convert_dict(proj,proj.__defaultdict__) - return json.dumps(dic, ensure_ascii=False, indent=2) - - -def Project_LoadJson(js): - d=json.loads(js); - return LoadProject_dict(d) - -def etl_factory(item,proj): - if isinstance(item,str): - item=eval('%s()'%item); - else: - item=item; - import copy - name = extends.get_type_name(item) - if name not in proj.__defaultdict__: - proj.__defaultdict__[name]=copy.deepcopy( item.__dict__); - return item; - - -def Project_LoadXml(path): - tree = ET.parse(path); - proj=Project(); - def factory(obj): - return etl_factory(obj,proj); - root = tree.getroot(); - root = root.find('Doc'); - for etool in root: - if etool.tag == 'Children': - etype = etool.get('Type'); - name = etool.get('Name'); - if etype == 'SmartETLTool': - etltool = factory(ETLTask()); - for m in etool: - if m.tag == 'Children': - type= m.attrib['Type'] - etl = factory(type); - etl.__proj__=proj - for att in m.attrib: - SetAttr(etl, att, m.attrib[att]); - etltool.AllETLTools.append(etl); - proj.modules[name] = etltool; - elif etype == 'SmartCrawler': - import spider; - crawler =factory(spider.SmartCrawler()); - crawler.HttpItem= factory(spider.HTTPItem()) - crawler.Name = etool.attrib['Name']; - crawler.IsMultiData = etool.attrib['IsMultiData'] - crawler.RootXPath= etool.attrib['RootXPath'] - httpconfig = GetChildNode(etool, 'HttpSet'); - InitFromHttpItem(httpconfig, crawler.HttpItem); - login = GetChildNode(etool, 'Login'); - if login is not None: - crawler.Login = factory(spider.HTTPItem()); - InitFromHttpItem(login, crawler.Login); - crawler.CrawItems = []; - for child in etool: - if child.tag == 'Children': - crawitem= factory(spider.CrawItem()); - crawitem.Name=child.attrib['Name']; - crawitem.XPath = child.attrib['XPath']; - crawler.CrawItems.append(crawitem); - - proj.modules[name] = crawler; - elif etool.tag == 'DBConnections': - for tool in etool: - if tool.tag == 'Children': - connector = extends.EObject(); - for att in tool.attrib: - SetAttr(connector, att, tool.attrib[att]); - proj.connectors[connector.Name] = connector; - - print('load project success') - return proj; - - -def generate(tools, generator=None, execute=False, enabledFilter=True): - #print(task_DumpLinq(tools)); - for tool in tools: - if tool.Enabled == 
False and enabledFilter == True: - continue - tool.init(); - if isinstance(tool,Executor) and execute==False: - continue; - - generator = tool.process(generator) - return generator; - -def parallel_map(task, execute=True): - tools = task.AllETLTools; - index = extends.getindex(tools, lambda d: isinstance(d, ToListTF)); - if index == -1: - index = 0; - tool = tools[index]; - generator = tool.process(None); - else: - generator = generate(tools[:index],None, execute=execute); - return generator; - -def parallel_reduce(task,generator=None, execute=True): - tools = task.AllETLTools; - index = extends.getindex(tools, lambda d: isinstance(d,ToListTF)); - index =0 if index==-1 else index; - generator = generate(tools[index + 1:], generator, execute); - return generator; - - - - - - -class ETLTask(extends.EObject): - def __init__(self): - self.AllETLTools = []; - - - - def QueryDatas(self, etlCount=100, execute=False): - return generate((tool for tool in self.AllETLTools[:etlCount]), None, execute); - - def Close(self): - for tool in self.AllETLTools: - if tool.Type in ['DbGE', 'DbEX']: - if tool.connector.TypeName == 'FileManager': - if tool.filetype == 'json': - tool.file.write('{}]'); - tool.file.close(); - - - def mThreadExecute(self, threadcount=10,canexecute=True): - import threadpool - pool = threadpool.ThreadPool(threadcount) - - seed= parallel_map(self,canexecute); - def Funcs(item): - task= parallel_reduce(self,[item],canexecute); - print('totalcount: %d'%len([r for r in task])); - print('finish' + str(item)); - - requests = threadpool.makeRequests(Funcs, seed); - [pool.putRequest(req) for req in requests] - pool.wait() - # self.__close__() - - diff --git a/etlpy/__init__.py b/etlpy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etlpy/concurrence.py b/etlpy/concurrence.py new file mode 100644 index 0000000..a65d78c --- /dev/null +++ b/etlpy/concurrence.py @@ -0,0 +1,190 @@ +# coding=utf-8 +import codecs +import time, os, sys + +parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, parentdir) +import traceback +from multiprocessing.managers import BaseManager +import os, sys +from etlpy.extends import PY2, is_in_ipynb, convert_dict, group_by_mount +from etlpy.multi_yielder import NETWORK_MODE +from etlpy.tools import Project, parallel_map, ex_generate +import json +import pickle,dill +authkey = "etlpy".encode('utf-8') +timeout = 1 +rpc_port = 28998 + +if PY2: + from Queue import Queue +else: + from queue import Queue + +from flask import request, jsonify +from flask import Flask + +finished_job = [] +dispatched_job = [] +app = Flask(__name__) + + +class Master: + def __init__(self): + # 派发出去的作业队列 + self.dispatched_job_queue = Queue() + # 完成的作业队列 + self.finished_job_queue = Queue() + + def get_dispatched_job_queue(self): + return self.dispatched_job_queue + + def get_finished_job_queue(self): + return self.finished_job_queue + + def server_init(self, port=None): + if port is None: + port = rpc_port + # 把派发作业队列和完成作业队列注册到网络上 + BaseManager.register('get_dispatched_job_queue', callable=self.get_dispatched_job_queue) + BaseManager.register('get_finished_job_queue', callable=self.get_finished_job_queue) + print('job queue port is %d' % port) + # 监听端口和启动服务 + manager.start() + print('server started') + if is_in_ipynb(): + print('exec in ipython notebook') + + def start_server(self, port=6007): + self.server_init() + app.run(host='0.0.0.0', port=port, debug=True, use_reloader=False) + + def start_project(self, project, job_name, 
_rpc_port=None): + self.server_init(_rpc_port) + module = self.project.env[job_name] + dispatched_jobs = self.manager.get_dispatched_job_queue() + try: + while True: + while True: + for job in module.pl_geneator(): + dispatched_jobs.put(job) + if not extends.is_ipynb: + key = input('press any key to repeat,c to cancel') + if key == 'c': + break + manager.shutdown() + except Exception as e: + import traceback + traceback.print_exc() + print('manager has shutdown') + manager.shutdown() + + +@app.route('/task/query/', methods=['GET']) +def query_task(method): + job_queue = manager.get_finished_job_queue() + while not job_queue.empty(): + job = job_queue.get(60) + finished_job.append(job) + result = {"remain": manager.get_dispatched_job_queue().qsize()} + if method == 'finished': + result[method] = finished_job + elif method == 'dispatched': + result[method] = dispatched_job + elif method == 'clean': + for r in dispatched_job: + dispatched_job.remove(r) + result = jsonify(**result) + return result + + +@app.route('/task/insert', methods=['POST']) +def insert_task(): + job = request.json # have no the json data? + dispatched_job_queue = manager.get_dispatched_job_queue() + result = {"status": "success", "remain": dispatched_job_queue.qsize()} + # for r in manager.get_finished_job_queue(): + # if r['id'] == job['id'] and r['name'] == job['name']: + # result['status']='failed' + # result['status'] = 'repeat,ignore' + # print('task %s, id %s finished, so skip...'%(r['name'],r['id'])) + # return jsonify(**result) + dispatched_job_queue.put(job) + dispatched_job.append(job) + result = jsonify(**result) + return result + + +# 问题 +# 1. 传送任务一定要带project吗? 传,但不希望每次都传,分组可以搞大一点 + +class Slave: + def __init__(self): + # 派发出去的作业队列 + self.dispatched_job_queue = Queue() + # 完成的作业队列 + self.finished_job_queue = Queue() + + def start(self, execute=True, server_ip='127.0.0.1', port=rpc_port): + # 把派发作业队列和完成作业队列注册到网络上 + BaseManager.register('get_dispatched_job_queue') + BaseManager.register('get_finished_job_queue') + server = server_ip + print('Connect to server %s...' 
% server) + manager = BaseManager(address=(server, port), authkey=authkey) + manager.connect() + # 使用上面注册的方法获取队列 + dispatched_jobs = manager.get_dispatched_job_queue() + finished_jobs = manager.get_finished_job_queue() + + while True: + if dispatched_jobs.empty(): + print('task finished, delay 20s') + time.sleep(20) + continue + job = dispatched_jobs.get(timeout=timeout) + project, name, tasks, job_id, env = job['proj'], job['name'], job['tasks'], job['id'], job['env'] + print('Run job: %s ' % job_id) + + proj = dill.loads(codecs.decode(project.encode(), "base64")) + etl_task = proj.env[name] + total_count = 0 + task_result = dict(name=name, job_id=job_id) + mapper, reducer, tool = parallel_map(etl_task.tools, env) + try: + generator = ex_generate(reducer, tasks, env=env) + for item in generator: + total_count += 1 + task_result['count'] = total_count + task_result['message'] = 'success' + except Exception as e: + task_result['message'] = str(e) + traceback.print_exc() + print('finish job,id %s, count %s' % (job_id, total_count)) + finished_jobs.put(task_result) + + +## RPC控制端口和共享端口不是一个 + +if __name__ == '__main__': + manager = BaseManager(address=('0.0.0.0', rpc_port), authkey=authkey) + ip = '127.0.0.1' # '10.101.167.107' + + mode = 'client' + argv = sys.argv + port = 6067 + if len(argv) > 1: + mode = argv[1] + if mode == 'server': + if len(argv) > 2: + port = int(argv[2]) + master = Master() + master.start_server(port) + else: + port= rpc_port + if len(argv) > 2: + ip = argv[2] + if len(argv) > 3: + port = int(argv[3]) + slave = Slave() + slave.start(True, ip, port) diff --git a/etlpy/etlpy.py b/etlpy/etlpy.py new file mode 100644 index 0000000..254de59 --- /dev/null +++ b/etlpy/etlpy.py @@ -0,0 +1,165 @@ +# coding=utf-8 +import inspect + +from etlpy.extends import is_str, concat, to_list +from etlpy.tools import ETLTool, Transformer, Executor, Generator, SubBase, WebTF, Filter, Project, MongoDBConnector, \ + ETLTask + +from etlpy import tools, extends +__base_type = [ETLTool, Filter, WebTF, SubBase, Generator, Executor, Transformer] +__ignore_paras = ['one_input', 'multi', 'column', 'p'] + +tool_dict = {} +proj = Project() + + +def set_level(level): + extends.set_level(level) + + + +def __get_etl(dic): + for name, tool_type in tools.__dict__.items(): + if not inspect.isclass(tool_type): + continue + if not issubclass(tool_type, ETLTool): + continue + if tool_type in __base_type: + continue + dic[name] = tool_type + + +__get_etl(tool_dict) + +# +# def process_req(self, args): +# if self.delay != 0: +# time.sleep(self.delay) +# headers = self.headers +# if headers not in ({}, ''): +# if is_str(headers): +# headers = para_to_dict(headers, '\n', ',') +# if self.agent: +# headers['User-Agent'] = random.choice(USER_AGENTS) +# args['headers'] = headers +# if self.proxy is not None and len(self.proxy) > 0: +# l = len(self.proxy) - 1 +# if self.allow_local == True: +# l += 1 +# index = random.randint(0, l) +# if index < len(self.proxy): +# proxy = self.proxy[index] +# args['proxies'] = {'http': proxy} +# +# + + +def html(text): + from IPython.core.display import HTML, display + display(HTML(text)) + + +def get_default_connector(): + mongo = MongoDBConnector() + mongo.connect_str = 'mongodb://10.244.0.112' + mongo.db = 'ant_temp' + proj.env['mongo'] = mongo + return mongo + + +# def proxy(port=8000, LoggingProxyHTTPHandler=): +# from http_proxy import LoggingProxyHTTPHandler +# import BaseHTTPServer +# server_address = ('', port) +# print('start proxy') +# httpd = 
BaseHTTPServer.HTTPServer(server_address, LoggingProxyHTTPHandler) +# httpd.serve_forever() + + +def task(name='etl'): + _task = ETLTask() + _task._proj = proj + if name in proj.env: + name +='_'+str(len(proj.env.keys())) + _task.name = name + proj.env[name] = _task + + def attr_filler(attr): + if attr.startswith('_'): + return True + if attr in __ignore_paras: + return True + return False + + def set_attr(val, dic): + default = type(val)().__dict__ + for key in val.__dict__: + if key.startswith('_'): + continue + dv = default[key] + value = dic.get(key, dv) + if key == 'p' and is_str(value) and value == '': + continue + if value is not None: + setattr(val, key, value) + + def _rename(module): + repl = {'TF': '', 'Regex': 're', 'Parallel': 'pl', 'Remove': 'rm', 'Move': 'mv', 'Copy': 'cp'} + for k, v in repl.items(): + module = module.replace(k, v) + return module.lower() + + dynaimc_method = '''def __%s(p='',%s): + import etlpy.tools as etl + new_tool=etl.%s() + new_tool._proj=proj + _task.tools.append(new_tool) + set_attr(new_tool,locals()) + return _task + ''' + + def merge_func(k, v): + if is_str(v): + v = "'%s'" % (v) + yield '%s=%s' % (k, v) + + def etl_help(): + import inspect + for k, v in _task.__dict__.items(): + if inspect.isfunction(v): + doc = v.__doc__ + if doc is None: + doc = 'doc is invalid' + else: + doc = doc.split('\n') + for d in doc: + d = d.strip() + if d != '': + doc = d + break + print(k + ':\t' + doc) + + for name, tool_type in tools.__dict__.items(): + if not inspect.isclass(tool_type): + continue + if not issubclass(tool_type, ETLTool): + continue + if tool_type in __base_type: + continue + tool = tool_type() + paras = to_list(concat((merge_func(k, v) for k, v in tool.__dict__.items() if not attr_filler(k)))) + paras.sort() + paras = ','.join(paras) + new_name = _rename(name) + method_str = dynaimc_method % (new_name, paras, name) + locals()['proj'] = proj + exec(method_str, locals()) + func = locals()['__' + new_name] + func.__doc__ = tool.__doc__ + setattr(_task, new_name, func) + setattr(_task, 'help', etl_help) + return _task + + +if __name__ == '__main__': + pass diff --git a/etlpy/extends.py b/etlpy/extends.py new file mode 100644 index 0000000..9917d49 --- /dev/null +++ b/etlpy/extends.py @@ -0,0 +1,641 @@ +# encoding: UTF-8 +import multiprocessing +import re +import sys +import logging +import cgitb + +from ipy_progressbar import ProgressBar + +PY2 = sys.version_info[0] == 2 + +enable_progress = True + +if PY2: + import codecs + from Queue import Queue, Empty + + open = codecs.open +else: + open = open + from queue import Queue, Empty + +debug_level = 4 + + +def is_in_ipynb(): + try: + from IPython import get_ipython + cfg = get_ipython() + return cfg is not None + except NameError: + return False + + +def set_level(level): + debug_level = level + if level > 0: + cgitb.enable(format='text') + + +is_ipynb = is_in_ipynb() + + +def is_str(s): + if PY2: + if isinstance(s, (str, unicode)): + return True + else: + if isinstance(s, (str)): + return True + return False + + +def to_str(s): + if PY2 and isinstance(s, unicode): + return s + + try: + return str(s) + except Exception as e: + if PY2: + return unicode(s) + return 'to_str error:' + str(e) + + +def read_config(config): + if isinstance(config, dict): + new_config = Config() + for k, v in config.items(): + new_config[k] = read_config(v) + return new_config + elif isinstance(config, list): + for i in range(len(config)): + config[i] = read_config(config[i]) + return config + + +class Config(dict): + def 
__init__(self, dic=None): + if dic is not None: + self.read_config(dic) + + def read_config(self, config): + dic2 = read_config(config) + for k, v in dic2.items(): + self[k] = v + + def __getattr__(self, item): + if item not in self: + return None + return self[item] + + def __setattr__(self, key, value): + self[key] = value + + +def get_range_mount(generator, start=None, end=None, interval=1): + i = 0 + i2 = 0 + if interval == 0: + interval = 1 + if isinstance(generator, list): + generator = generator[start:end] + for r in generator: + yield r + else: + if start is None: + start = -1 + if end is None: + end = -1 + for r in generator: + i += 1 + if i < start + 1: + continue + if end > 0 and i > end: + break + i2 += 1 + if i2 % interval == 0: + yield r + + +def get_mount(generator, take=None, skip=0): + i = 0 + for r in generator: + i += 1 + if i < skip: + continue + if isinstance(take, int) and i > 0 and i > take + skip: + break + yield r + + +def foreach(generator, func): + for r in generator: + func(r) + yield r + + +def concat(generators): + for g in generators: + for r in g: + yield r + + +def to_list(generator, max_count=None): + datas = [] + count = 0 + for r in generator: + count += 1 + datas.append(r) + if max_count is not None and count >= max_count: + break + return datas + + +def progress_indicator(generator, title='Position Indicator', count=2000): + if not enable_progress: + for r in generator: + yield r + return + load = False + try: + # from ipy_progressbar import ProgressBar + # load=True + pass + except Exception as e: + p_expt(e) + if is_ipynb and load: + + generator = ProgressBar(generator, title=title) + generator.max = count + generator.start() + for data in generator: + yield data + generator.finish() + else: + id = 0 + for data in generator: + id += 1 + yield data + print('task finished') + + +def revert_invoke(item, funcs): + for i in range(0, len(funcs), -1): + item = funcs[i](item) + return item + + +def s_invoke(func, **param): + try: + if debug_level > 2: + logging.info('invoke' + str(func)) + return func(param) + except Exception as e: + p_expt(e) + + +def p_expt(e): + if debug_level >= 3: + logging.exception(e) + elif debug_level < 2 and debug_level > 0: + logging.error(e) + else: + pass + + +def collect(generator, format='print', paras=None): + if format == 'print' and not is_ipynb: + import pprint + for d in generator: + pprint.pprint(d) + return + elif format == 'keys': + for d in generator: + for k in paras: + print("%s: %s " % (k, d.get(k, 'None'))) + elif format == 'key': + import pprint + for d in generator: + pprint.pprint(d.keys()) + return + elif format == 'count': + count = 0 + for d in generator: + count += 1 + print('total count is ' + str(count)) + list_datas = to_list(progress_indicator(generator)) + if is_in_ipynb() or format == 'df': + from pandas import DataFrame + return DataFrame(list_datas) + else: + return list_datas + + +def format(form, keys): + res = form + for i in range(len(keys)): + res = res.replace('{' + to_str(i) + '}', to_str(keys[i])) + return res + + +def get_keys(generator, s): + count = 0 + for r in generator: + count += 1 + if count < 5: + for key in r.keys(): + if not key.startswith('_'): + try: + setattr(s, key, key) + except Exception as e: + pass + yield r + + +def repl_long_space(txt): + spacere = re.compile("[ ]{2,}") + spacern = re.compile("(^\r\n?)|(\r\n?$)") + r = spacere.subn(' ', txt)[0] + r = spacern.subn('', r)[0] + return r + + +def merge(d1, d2): + for r in d2: + d1[r] = d2[r] + return d1 + + +def 
conv_dict(dic, para_dic): + import copy + dic = copy.copy(dic) + for k, v in para_dic.items(): + if k == v: + continue + if k in dic: + dic[v] = dic[k] + del dic[k] + return dic + + +def replace_paras(item, old_value): + def get_short(v): + if v == '_': + return old_value + return v + + if isinstance(item, dict): + p = {} + for k, v in item.items(): + p[get_short(k)] = get_short(v) + return p + elif isinstance(item, list): + for i in range(len(item)): + item[i] = get_short(item[i]) + return item + return item + + +def para_to_dict(para, split1, split2): + r = {} + for s in para.split(split1): + s = s.strip() + rs = s.split(split2) + + key = rs[0].strip() + if len(rs) < 2: + value = key + else: + value = s[len(key) + 1:].strip() + if key == '': + continue + r[key] = value + return r + + +def split(string, char): + sp = string.split(char) + result = [] + for r in sp: + if r == '': + continue + result.append(r) + return result + + +def get_num(x, method=int, default=None): + if x in [None, '']: + return None + try: + return method(x) + except: + if default is None: + return x + return default + + +def merge_query(d1, d2, columns): + if is_str(columns) and columns.strip() != "": + if columns.find(":") > 0: + columns = para_to_dict(columns, ' ', ':') + else: + columns = columns.split(' ') + if columns is None: + return d1 + if isinstance(columns, list): + for r in columns: + if r in d2: + d1[r] = d2[r] + elif isinstance(columns, dict): + for k, v in columns.items(): + d1[v] = d2[k] + return d1 + + +import types + + +def tramp(gen, *args, **kwargs): + g = gen(*args, **kwargs) + while isinstance(g, types.GeneratorType): + g = g.next() + return g + + +import inspect + + +def is_iter(item): + if isinstance(item, list): + return True + if inspect.isgenerator(item): + return True + + +def first_or_default(generator): + for r in generator: + return r + return None + + +def query(data, key, default=None): + if data is None: + return key + if isinstance(data, dict): + if is_str(key) and key.startswith('[') and key.endswith(']'): + key = key[1:-1] + if key == '_': + key = default + if key in data: + return data[key] + else: + return None + return key + + +def get_value(data, key): + if key in ['', None]: + return data + if isinstance(data, dict): + return data.get(key, None) + else: + if hasattr(data, key): + return getattr(data, key) + return None + + +def set_value(data, key, value): + if key in ['', None]: + return data + if isinstance(data, dict): + data[key] = value + else: + setattr(data, key, value) + return data + + +def has(data, key): + if isinstance(data, dict): + return key in data + else: + return key in data.__dict__ + + +def del_value(data, key): + if key in ['', None]: + return + if isinstance(data, dict): + del data[key] + else: + del data.__dict__[key] + + +def get_variance(n_list): + sum1 = 0.0 + sum2 = 0.0 + N = len(n_list) + for i in range(N): + sum1 += n_list[i] + sum2 += n_list[i] ** 2 + mean = sum1 / N + var = sum2 / N - mean ** 2 + return var + + +def find_any(iter, filter): + for r in iter: + if filter(r): + return True + return False + + +def get_index(iter, filter): + for r in range(len(iter)): + if filter(iter[r]): + return r + return -1 + + +def get_indexs(iter, filter): + res = [] + for r in range(len(iter)): + if filter(iter[r]): + res.append(r) + return res + + +def cross(a, gene_func, env): + for r1 in a: + r1 = dict.copy(r1) + for r2 in gene_func(r1, env): + for key in r2: + r1[key] = r2[key] + yield dict.copy(r1) + + +def mix(g1, g2): + while True: + t1 = g1.next() + 
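        # Descriptive note (added as a comment, since this point sits
        # mid-function): g1.next() is the Python 2 iterator protocol and
        # fails with AttributeError on Python 3; an exhausted generator also
        # raises StopIteration rather than returning None, so the None checks
        # below never observe exhaustion. A portable sketch of the same
        # interleave step, assuming nothing beyond the stdlib, would be:
        #     t1 = next(g1, None)   # returns None once g1 is exhausted
        #     t2 = next(g2, None)
        # which makes the sentinel-based termination below behave as intended.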
if t1 is None: + pass + else: + yield t1 + t2 = g2.next() + if t2 is None: + pass + else: + yield t2 + if t1 is None and t2 is None: + return + + +def cross_array(a, b, func): + for i in a: + for j in b: + yield func(i, j) + + +def merge_all(a, b): + while True: + t1 = a.__next__() + if t1 is None: + return + t2 = b.__next__() + if t2 is not None: + for t in t2: + t1[t] = t2[t] + yield t1 + + +def append(a, b): + for r in a: + yield r + for r in b: + yield r + + +def get_type_name(obj): + import inspect + if inspect.isclass(obj): + s = str(obj) + else: + s = str(obj.__class__) + p = s.find('.') + r = s[p + 1:].split('\'')[0] + r = r.replace('tools.', '') + return r + + +def copy(x): + if hasattr(x, 'copy'): + return x.copy() + return x + + +class EObject(object): + ''' + empty class, which mark a class to be a dict. + ''' + def __getstate__(self): + """Return state values to be pickled.""" + dic = {} + for k, v in self.__dict__.items(): + if k.startswith('__'): + continue + dic[k] = v + return dic + + def __setstate__(self, state): + """Restore state from the unpickled state values.""" + for k, v in state.items(): + self.__setattr__(k, v) + + +def get_range(range, env=None): + def get(key): + return query(env, key) + + buf = [r for r in range.split(':')] + start = 0 + end = interval = 1 + if len(buf) > 2: + interval = get_num(get(buf[2])) + if len(buf) > 1: + end = get_num(get(buf[1])) + if len(buf)>0: + start = get_num(get(buf[0])) + return start, end, interval + + +def convert_to_builtin_type(obj): + return {key: value for key, value in obj.__dict__.items() if + isinstance(value, (str, int, float, list, dict, tuple, EObject) or value is None)} + + +def dict_to_poco_type(obj): + if isinstance(obj, dict): + result = EObject() + for key in obj: + v = obj[key] + setattr(result, key, dict_to_poco_type(v)) + return result + elif isinstance(obj, list): + for i in range(len(obj)): + obj[i] = dict_to_poco_type(obj[i]) + return obj + + +def dict_copy_poco(obj, dic): + for key, value in obj.__dict__.items(): + if key in dic: + value = dic[key] + if isinstance(value, (int, float)) or is_str(value): + setattr(obj, key, value) + + +def convert_dict(obj): + if not isinstance(obj, (int, float, list, dict, tuple, EObject)) and not is_str(obj): + return None + if isinstance(obj, EObject): + d = {} + obj_type = type(obj) + typename = get_type_name(obj) + default = obj_type().__dict__ + if typename == 'ETLTask': + d['tools'] = convert_dict(obj.tools) + else: + for key, value in obj.__dict__.items(): + if value == default.get(key, None): + continue + if key.startswith('_'): + continue + p = convert_dict(value) + if p is not None: + d[key] = p + d['Type'] = typename + return d + + elif isinstance(obj, list): + return [convert_dict(r) for r in obj] + elif isinstance(obj, dict): + return {key: convert_dict(value) for key, value in obj.items()} + return obj + + +def group_by_mount(generator, group_count=10): + tasks = [] + task_id = 0 + if isinstance(generator, list): + generator = (r for r in generator) + while True: + task = next(generator, None) + if task is None: + yield tasks[:] + return + tasks.append(task) + if len(tasks) >= group_count: + yield tasks[:] + task_id = task_id + 1 + tasks = [] diff --git a/etlpy/multi_yielder.py b/etlpy/multi_yielder.py new file mode 100644 index 0000000..a285092 --- /dev/null +++ b/etlpy/multi_yielder.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +# coding=utf-8 +import codecs +import logging +import multiprocessing +import os + +from etlpy.extends import Queue,Empty + 
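# The mode constants below select how multi_yield (defined later in this
# module) parallelizes a consumer function: single process, threads,
# processes, gevent coroutines, or networked workers. A minimal usage sketch,
# kept in comment form since this sits inside the module; consume/squares are
# illustrative names, not part of the library:
#
#     def consume(tasks):          # tasks is a generator of pending items
#         for t in tasks:
#             yield t * t          # stand-in for real work, e.g. an HTTP get
#
#     squares = multi_yield(consume, mode=THREAD_MODE, worker_count=4,
#                           generator=iter(range(100)))
#     for s in squares:
#         print(s)
#
# Results arrive in completion order rather than submission order, because
# all workers push into a shared result queue.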
+NORMAL_MODE= 0 +THREAD_MODE = 1 +PROCESS_MODE =2 +ASYNC_MODE = 'async' +NETWORK_MODE =3 +DEFAULT_WORKER_NUM=20 + +open = codecs.open + + +class Stop(Exception): + "Exception raised by Queue.get(block=0)/get_nowait()." + pass + + +class Yielder(object): + def __init__(self, dispose): + self.dispose = dispose + + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + self.dispose() + + +def safe_queue_get(queue, is_stop_func=None, timeout=2): + while True: + if is_stop_func is not None and is_stop_func(): + return Stop + try: + data = queue.get(timeout=timeout) + return data + except Exception as e: + continue + + +def safe_queue_put(queue, item, is_stop_func=None, timeout=2): + while True: + if is_stop_func is not None and is_stop_func(): + return Stop + try: + queue.put(item, timeout=timeout) + return item + except Exception as e: + continue + + +def multi_yield(customer_func, mode=THREAD_MODE, worker_count=1, generator=None, queue_size=10): + workers = [] + + def is_alive(process): + if mode == PROCESS_MODE: + return process.is_alive() + elif mode == THREAD_MODE: + return process.isAlive() + return True + + class Stop_Wrapper(): + def __init__(self): + self.stop_flag = False + self.workers=[] + + def is_stop(self): + return self.stop_flag + + def stop(self): + self.stop_flag = True + for process in self.workers: + if isinstance(process,multiprocessing.Process): + process.terminate() + + stop_wrapper = Stop_Wrapper() + + def _boss(task_generator, task_queue, worker_count): + for task in task_generator: + item = safe_queue_put(task_queue, task, stop_wrapper.is_stop) + if item is Stop: + return + for i in range(worker_count): + task_queue.put(Empty) + + def _worker(task_queue, result_queue, gene_func): + import time + try: + def generator(): + while not stop_wrapper.is_stop(): + if task_queue.empty(): + time.sleep(0.01) + continue + task = safe_queue_get(task_queue, stop_wrapper.is_stop) + if task == Empty: + result_queue.put(Empty) + break + if task == Stop: + break + yield task + for item in gene_func(generator()): + item = safe_queue_put(result_queue, item, stop_wrapper.is_stop) + if item == Stop: + break + except Exception as e: + logging.exception(e) + + def factory(func, args=None, name='task'): + if args is None: + args = () + if mode == PROCESS_MODE: + return multiprocessing.Process(name=name, target=func, args=args) + if mode == THREAD_MODE: + import threading + t = threading.Thread(name=name, target=func, args=args) + t.daemon = True + return t + if mode == ASYNC_MODE: + import gevent + return gevent.spawn(func, *args) + + def queue_factory(size): + if mode == PROCESS_MODE: + return multiprocessing.Queue(size) + elif mode == THREAD_MODE: + return Queue(size) + elif mode == ASYNC_MODE: + from gevent import queue + return queue.Queue(size) + + def should_stop(): + if not any([r for r in workers if is_alive(r)]) and result_queue.empty(): + return True + return stop_wrapper.is_stop() + + if mode is None or mode == NORMAL_MODE: + for item in customer_func(generator): + yield item + return + with Yielder(stop_wrapper.stop): + result_queue = queue_factory(queue_size) + task_queue = queue_factory(queue_size) + + main = factory(_boss, args=(generator, task_queue, worker_count), name='_boss') + for process_id in range(0, worker_count): + name = 'worker_%s' % (process_id) + p = factory(_worker, args=(task_queue, result_queue, customer_func), name=name) + workers.append(p) + main.start() + stop_wrapper.workers = workers[:] + stop_wrapper.workers.append(main) + for r 
in workers: + r.start() + count = 0 + while not should_stop(): + data = safe_queue_get(result_queue, should_stop) + if data is Empty: + count += 1 + if count == worker_count: + break + continue + if data is Stop: + break + else: + yield data + + +def get_split(datas, count, index): + l = len(datas) + assert index < count + if count > l: + count = l + seg = l / count + end = l if index == count - 1 else seg * (index + 1) + data = datas[seg * index: end] + return data + + +def exec_cmd(exec_str): + os.system(exec_str) + + diff --git a/etlpy/params.py b/etlpy/params.py new file mode 100644 index 0000000..5ea1a30 --- /dev/null +++ b/etlpy/params.py @@ -0,0 +1,74 @@ +import inspect +import random +import types + +from etlpy.extends import EObject, query +from etlpy.proxy import USER_AGENTS + + +class Param(EObject): + def __init__(self, value=None, **kwargs): + super(Param, self).__init__() + if value is not None: + self.value = value + else: + self.value= kwargs + + def __str__(self): + if isinstance(self.value, dict): + return 'Param{' +' , '.join('%s:%s'%(k,v) for k,v in self.value.items()) + '}' + return 'Param{'+str(type(self.value)) + '}' + + + def __setitem__(self, instance, value): + assert isinstance(self.value, dict) + self.value[instance]=value + + def get(self, data,col): + values = self.value.copy() + if isinstance(values, dict): + for k, v in values.items(): + if issubclass(type(v), Param): + values[k] = v.get(data,col) + if inspect.isfunction(v): + values[k] = v(data) + elif inspect.isfunction(values) or isinstance(values, types.LambdaType): + values= values() + return values + + def merge_all(self, item): + assert issubclass(type(item),Param) or isinstance(item,dict) + dic3 = self.value.copy() + value = item if isinstance(item,dict) else item.value + for k, v in value.items(): + dic3[k] = v + return Param(dic3) + + def merge(self, key, item): + dic3 = self.value.copy() + if key in dic3: + dic3[key] = dic3[key].merge_all(item) + else: + dic3[key]= item + return Param(dic3) + + def copy(self): + return Param(self.value.copy()) + + +class ExpParam(Param): + def get(self, data,col): + if inspect.isfunction(self.value): + return self.value(data) + return query(data, self.value,col) + + +class RandomParam(Param): + def get(self, data,col): + return random.choice(self.value) + + +request_param = Param({'url': ExpParam('[_]'), 'headers': Param({'user_agents': USER_AGENTS[-2]})}) + +if __name__ == '__main__': + print(request_param.get({})) \ No newline at end of file diff --git a/etlpy/pickledb.py b/etlpy/pickledb.py new file mode 100644 index 0000000..d3c7fb8 --- /dev/null +++ b/etlpy/pickledb.py @@ -0,0 +1,182 @@ + +import os +import simplejson + +def load(location, option): + '''Return a pickledb object. location is the path to the json file.''' + return pickledb(location, option) + +class pickledb(object): + + def __init__(self, location, option): + '''Creates a database object and loads the data from the location path. 
+ If the file does not exist it will be created on the first update.''' + self.load(location, option) + + def load(self, location, option): + '''Loads, reloads or changes the path to the db file.''' + location = os.path.expanduser(location) + self.loco = location + self.fsave = option + if os.path.exists(location): + try: + self._loaddb() + except Exception as e: + print(e) + self.db={} + + else: + self.db = {} + return True + + def dump(self): + '''Force dump memory db to file.''' + self._dumpdb(True) + return True + + def set(self, key, value): + '''Set the (string,int,whatever) value of a key''' + self.db[key] = value + self._dumpdb(self.fsave) + return True + + def get(self, key,default=None): + '''Get the value of a key''' + try: + return self.db.get(key,default) + except KeyError: + return None + + def size(self): + return len(self.db.keys()) + + def getall(self): + '''Return a list of all keys in db''' + return self.db.keys() + + def rem(self, key): + '''Delete a key''' + del self.db[key] + self._dumpdb(self.fsave) + return True + + def lcreate(self, name): + '''Create a list''' + self.db[name] = [] + self._dumpdb(self.fsave) + return True + + def ladd(self, name, value): + '''Add a value to a list''' + self.db[name].append(value) + self._dumpdb(self.fsave) + return True + + def lextend(self, name, seq): + '''Extend a list with a sequence''' + self.db[name].extend(seq) + self._dumpdb(self.fsave) + return True + + def lgetall(self, name): + '''Return all values in a list''' + return self.db[name] + + def lget(self, name, pos): + '''Return one value in a list''' + return self.db[name][pos] + + def lrem(self, name): + '''Remove a list and all of its values''' + number = len(self.db[name]) + del self.db[name] + self._dumpdb(self.fsave) + return number + + def lpop(self, name, pos): + '''Remove one value in a list''' + value = self.db[name][pos] + del self.db[name][pos] + self._dumpdb(self.fsave) + return value + + def llen(self, name): + '''Returns the length of the list''' + return len(self.db[name]) + + def append(self, key, more): + '''Add more to a key's value''' + tmp = self.db[key] + self.db[key] = ('%s%s' % (tmp, more)) + self._dumpdb(self.fsave) + return True + + def lappend(self, name, pos, more): + '''Add more to a value in a list''' + tmp = self.db[name][pos] + self.db[name][pos] = ('%s%s' % (tmp, more)) + self._dumpdb(self.fsave) + return True + + def dcreate(self, name): + '''Create a dict''' + self.db[name] = {} + self._dumpdb(self.fsave) + return True + + def dadd(self, name, pair): + '''Add a key-value pair to a dict, "pair" is a tuple''' + self.db[name][pair[0]] = pair[1] + self._dumpdb(self.fsave) + return True + + def dget(self, name, key): + '''Return the value for a key in a dict''' + return self.db[name][key] + + def dgetall(self, name): + '''Return all key-value pairs from a dict''' + return self.db[name] + + def drem(self, name): + '''Remove a dict and all of its pairs''' + del self.db[name] + self._dumpdb(self.fsave) + return True + + def dpop(self, name, key): + '''Remove one key-value pair in a dict''' + value = self.db[name][key] + del self.db[name][key] + self._dumpdb(self.fsave) + return value + + def dkeys(self, name): + '''Return all the keys for a dict''' + return self.db[name].keys() + + def dvals(self, name): + '''Return all the values for a dict''' + return self.db[name].values() + + def dexists(self, name, key): + '''Determine if a key exists or not''' + if self.db[name][key] is not None: + return 1 + else: + return 0 + + def deldb(self): + '''Delete 
everything from the database''' + self.db= {} + self._dumpdb(self.fsave) + return True + + def _loaddb(self): + '''Load or reload the json info from the file''' + self.db = simplejson.load(open(self.loco, 'rb')) + + def _dumpdb(self, forced): + '''Write/save the json dump into the file''' + if forced: + simplejson.dump(self.db, open(self.loco, 'wt')) \ No newline at end of file diff --git a/etlpy/proxy.py b/etlpy/proxy.py new file mode 100644 index 0000000..767f17b --- /dev/null +++ b/etlpy/proxy.py @@ -0,0 +1,145 @@ +# coding=utf-8 +import json + +# if PY2: +# import StringIO +# from BaseHTTPServer import BaseHTTPRequestHandler +# else: +# from http.server import BaseHTTPRequestHandler, HTTPServer +import gzip +import requests +import time + +from etlpy.spider import get_encoding + +proxy_url='http://123.207.35.36:5010' + +def get_proxy(): + return requests.get(proxy_url+"/get/").content + +def get_proxy_all(): + url= requests.get(proxy_url+"/get_all/").json() + return url + +def delete_proxy(proxy): + requests.get(proxy_url+"/delete/?proxy={}".format(proxy)) + + + +USER_AGENTS = [ + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", + "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", + "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", + "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", + "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", + "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", +] + + + + + + +""" +usage 'pinhole port host [newport]' + +Pinhole forwards the port to the host specified. +The optional newport parameter may be used to +redirect to a different port. + +eg. pinhole 80 webserver + Forward all incoming WWW sessions to webserver. + + pinhole 23 localhost 2323 + Forward all telnet sessions to port 2323 on localhost. 
+""" + +import sys +from socket import * +from threading import Thread +import time + +LOGGING = 1 + +def log(s): + if LOGGING: + print + '%s:%s' % (time.ctime(), s) + sys.stdout.flush() + + +class PipeThread(Thread): + pipes = [] + + def __init__(self, source, sink): + Thread.__init__(self) + self.source = source + self.sink = sink + log('Creating new pipe thread %s ( %s -> %s )' % \ + (self, source.getpeername(), sink.getpeername())) + PipeThread.pipes.append(self) + log('%s pipes active' % len(PipeThread.pipes)) + + def run(self): + while 1: + try: + data = self.source.recv(1024) + if not data: break + self.sink.send(data) + except: + break + + log('%s terminating' % self) + PipeThread.pipes.remove(self) + log('%s pipes active' % len(PipeThread.pipes)) + + +class Pinhole(Thread): + def __init__(self, port, newhost, newport): + Thread.__init__(self) + log('Redirecting: localhost:%s -> %s:%s' % (port, newhost, newport)) + self.newhost = newhost + self.newport = newport + self.sock = socket(AF_INET, SOCK_STREAM) + self.sock.bind(('', port)) + self.sock.listen(5) + + def run(self): + while 1: + newsock, address = self.sock.accept() + log('Creating new session for %s %s ' % address) + fwd = socket(AF_INET, SOCK_STREAM) + fwd.connect((self.newhost, self.newport)) + PipeThread(newsock, fwd).start() + PipeThread(fwd, newsock).start() + + +if __name__ == '__main__': + + print('Starting Pinhole') + + import sys + + sys.stdout = open('pinhole.log', 'w') + + if len(sys.argv) > 1: + port = newport = int(sys.argv[1]) + newhost = sys.argv[2] + if len(sys.argv) == 4: newport = int(sys.argv[3]) + Pinhole(port, newhost, newport).start() + else: + Pinhole(80, 'hydrogen', 80).start() + Pinhole(23, 'hydrogen', 23).start() + + + diff --git a/etlpy/spider.py b/etlpy/spider.py new file mode 100644 index 0000000..453ee87 --- /dev/null +++ b/etlpy/spider.py @@ -0,0 +1,559 @@ +# coding=utf-8 + +import sys; +import re +import requests +from lxml import etree +from itertools import groupby +from etlpy.extends import EObject, to_str, PY2, get_variance, is_str +box_regex = re.compile(r"\[\d{1,3}\]"); +agent_list = [] + + +class XPath(EObject): + def __init__(self, name=None, xpath=None, is_html=False, sample=None, must=False): + self.path = xpath; + self.sample = sample; + self.name = name; + self.must = must; + self.is_html = is_html; + self.children = []; + + def __str__(self): + return "%s %s %s" % (self.name, self.path, self.sample); + + +def xpath_rm_last_num(paths): + v = paths[-1]; + m = box_regex.search(v); + if m is not None: + s = m.group(0); + paths[-1] = v.replace(s, ""); + return '/'.join(paths); + + +def get_common_xpath(xpaths): + paths = [r.path.split('/') for r in xpaths]; + minlen = min(len(r) for r in paths); + c = None; + for i in range(minlen): + for index in range(len(paths)): + path = paths[index]; + if index == 0: + c = path[i]; + elif c != path[i]: + first = path[0:i + 1]; + return xpath_rm_last_num(first); + + +def xpath_take_off(path, root_path): + r = path.replace(root_path, ''); + if r.startswith('['): + r = '/'.join(r.split('/')[1:]) + return r + + +def xpath_iter_sub(path): + xps = path.split('/'); + for i in range(2, len(xps)): + xp = xpath_rm_last_num(xps[:i]) + yield xp; + + +attrsplit = re.compile('@|\['); + + +def get_xpath_data(node, path, is_html=False, only_one=True): + p = node.xpath(path); + if p is None: + return None; + if len(p) == 0: + return None; + paths = path.split('/'); + last = paths[-1]; + attr = False; + if last.find('@') >= 0: # and last.find('[1]')>=0: + attr = 
True; + results = []; + + def get(x): + if attr: + return to_str(x) + elif is_html: + return etree.tostring(x).decode('utf-8') + else: + return get_node_text(x); + + for n in p: + result = get(n) + if only_one: + return result; + results.append(result); + return results; + + +_extract = re.compile('\[(\w+)\]'); +_charset = re.compile(']*?charset="?(\\w+)[\\W]*?>'); +_charset = re.compile('charset="?([A-Za-z0-9-]+)"?>'); + +default_encodings = ['utf-8', 'gbk']; + + +def get_encoding(html): + encoding = _charset.search(to_str(html)) + if encoding is not None: + encoding = encoding.group(1); + if encoding is None: + encoding = 'utf-8' + except_encoding = encoding + try: + result = html.decode(except_encoding) + return result; + except UnicodeDecodeError as e: + pass; + + for en in default_encodings: + if en == except_encoding: + continue; + try: + result = html.decode(en) + return result; + except UnicodeDecodeError as e: + continue; + sys.stderr.write(str(e) + '\n'); + import chardet + en = chardet.detect(html)['encoding'] + result = html.decode(en, errors='ignore'); + return result + + +def get_html(url): + r = requests.get(url) + return r.text; + + +def is_none(data): + return data is None or data == ''; + + +def __get_node_text(node, array): + if hasattr(node, 'tag') and isinstance(node.tag, str) and node.tag.lower() not in ['script', 'style', 'comment']: + t = node.text; + if t is not None: + t = t.strip() + if t != '': + array.append(t) + t = node.tail; + if t is not None: + t = t.strip() + if t != '': + array.append(t) + for sub in node.iterchildren(): + __get_node_text(sub, array) + + +def get_node_text(node): + if node is None: + return "" + array = []; + __get_node_text(node, array); + return ' '.join(array); + + +def get_node_html(node): + if node is None: + return "" + if str(type(node)).lower().find('str') > 0: + return str(node) + else: + return etree.tostring(node).decode('utf-8'); + + +if PY2: + pass +else: + import html.parser as h + + html_parser = h.HTMLParser() + +PM25 = 2.4; + +ignoretag = re.compile('script|style'); +boxRegex = re.compile(r"\[\d{1,3}\]"); + + +def str_match(text, keyword, match_func): + if text is None: return 0; + keyword = keyword.strip(); + items = [r.strip() for r in re.split('\t|\r|\n', text)]; + for r in items: + if match_func(r, keyword): + return 1; + return 0; + + +def get_node_leaf_count(node): + count = 0; + if node is None: return; + nodes = [r for r in node.iterchildren()]; + c = len(nodes); + if c == 0: count += 1; + for node in nodes: + count += get_node_leaf_count(node); + return count; + + +def _is_same_path(p1, p2, root_path): + path1 = p1.replace(root_path, ''); + path2 = p2.replace(root_path, ''); + return str(path1) == str(path2); + + +def get_diff_page(htmls, has_attr): + trees = [] + nodes = []; + for html in htmls: + root = etree.HTML(html); + tree = etree.ElementTree(root); + nodes.append(root); + trees.append(tree); + xpaths = []; + __get_diff_nodes(trees, nodes, xpaths, has_attr); + return xpaths; + + +def search_text_root(tree, node): + class ParaClass(object): + def __init__(self): + self.tlen = 0; + self.node = '' + + para = ParaClass(); + __search_text_root(tree, None, node, para); + return para.node + + +def __search_text_root(tree, father, node, para): + if hasattr(node, 'tag') and isinstance(node.tag, str) and node.tag.lower() not in ['script', 'style', 'comment']: + child_nodes = [n for n in node.iterchildren()]; + if len(child_nodes) > 0: + for child in child_nodes: + __search_text_root(tree, node, child, para); + 
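            # Descriptive note: this recursion is a simple main-content
            # heuristic. It walks the element tree depth-first; at each leaf
            # it compares the text length against the longest seen so far
            # (para.tlen) and records the leaf's parent in para.node.
            # search_text_root then returns that parent, i.e. the element
            # owning the longest text run, which get_main treats as the
            # page's probable body node.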
else: + text = node.text; + if text == None: + return; + tlen = len(text) + if tlen > para.tlen: + para.tlen = tlen; + para.node = father + + +def __get_sub_xpath(path, slice): + r = path.split('/'); + paths = slice(r) + return '/'.join(paths) + + +def __search_table_root(tree, nodes, path_dict, has_child, strict=True): + if strict: + variance_max = 2 + else: + variance_max = 10 + + if nodes is None: + return; + if len(nodes) == 0: + return None; + node = nodes[0]; + if has_child: + for node in nodes: + all_childs = [child for child in node.iterchildren() if str(child.__class__).find('Element') > 0] + childs = groupby(all_childs, key=lambda node: node.tag); + for key, node_group in childs: + node_group = list(node_group); + __search_table_root(tree, node_group, path_dict, has_child, strict); + target_node = list(node for node in nodes); # filter(lambda x:not x.tag.startswith("#"),nodes); + child_count = float(len(target_node)); + if 5 > child_count: + return; + same_name_count = len([x for x in target_node if x.tag == target_node[1].tag]); + if same_name_count < child_count * 0.7: return; + child_counts = []; + for n in target_node: + child_counts.append(len(list(r for r in n.iterchildren()))); + variance = get_variance(child_counts); + if variance > variance_max: return; + leaf_count = get_node_leaf_count(target_node[0]); + if leaf_count < 2: + return; + value = child_count * PM25 + leaf_count; + xpath = xpath_rm_last_num(tree.getpath(node).split('/')); + path_dict[xpath] = value; + + +def search_table_root(root, has_child=True): + d = {}; + tree = etree.ElementTree(root); + __search_table_root(tree, root, d, has_child); + return d; + + +def _str_find(string, word): + return string.find(word) >= 0; + + +def _regex_find(string, regex): + res = re.match(regex, string); + if res: + return res; + return res; + + +def _tn_find(string, rule): + from tn import core + return core.match(string, rule) is not None; + + +def search_xpath(node, keyword, match_func='str', has_attr=False): + tree = etree.ElementTree(node); + dics = {'str': _str_find, 'tn': _tn_find, 'script': _regex_find}; + return __search_xpath(tree, node, keyword, dics[match_func], has_attr); + + +def __search_xpath(tree, node, keyword, match_func, has_attr=False): + if node is None or keyword is None: return; + nodes = node.iterchildren(); + for node in nodes: + if str(node.__class__).find("Element") > 0: + path = __search_xpath(tree, node, keyword, match_func, has_attr); + if path is not None: + return path; + if node.text is not None and str_match(node.text, keyword, match_func): + xpath = tree.getpath(node) + return xpath; + if has_attr: + for r in node.attrib: + if str_match(node.attrib[r], keyword, match_func): + xpath = tree.getpath(node) + '/@%s[1]' % (r) + return xpath; + return None + + +def __get_nearest_node(targets, node): + dic = {}; + for target in targets: + if type(target) != type(node): + continue; + if target.tag != node.tag: + continue; + dis = len(target.attrib.keys()) - len(node.attrib.keys()) + dis = abs(dis); + dis += abs(get_node_leaf_count(target) - get_node_leaf_count(node)) + dic[target] = dis; + minv = 99999; + selected_node = None; + for k, v in dic.items(): + if v < minv: + selected_node = k; + return selected_node; + + +def __get_diff_nodes(trees, nodes, xpaths, has_attr): + def get_tree(i): + if isinstance(trees, list): + return trees[i]; + return trees; + + is_child_contain_info = False; + index = int(len(nodes) / 2); + node1 = nodes[index] + tree1 = etree.ElementTree(node1); + node1path = 
get_tree(0).getpath(node1); + for child_node1 in node1.iterchildren(): + path = '/'.join(tree1.getpath(child_node1).split('/')[2:]) + node_child2 = []; + for node in nodes: + targets = node.xpath(path); + if len(targets) == 0: + continue # TODO: this is fucked + node_child2.append(targets[0]); + if len(node_child2) <= 1: + continue; + is_child_contain_info |= __get_diff_nodes(trees, node_child2, xpaths, has_attr); + if is_child_contain_info == False: + for i in range(0, len(nodes)): + node = nodes[i]; + if not __is_same_string(node.text, node1.text): + prop_name = __search_node_name(node, xpaths); + xpath = XPath(prop_name) + xpath.sample = node1.text; + xpath.path = node1path if len(xpaths) % 2 == 0 else get_tree(i).getpath(node); + xpaths.append(xpath); + is_child_contain_info = True; + break; + if not has_attr: + return is_child_contain_info; + for r in node1.attrib: + v = node1.attrib[r]; + for i in range(0, len(nodes)): + node = nodes[i]; + value = node.attrib.get(r, None); + if value is None: + break; + if node.attrib[r] != v: + xpath = XPath(__search_node_name(r, xpaths) + "_" + r); + xpath.path = node1path if len(xpaths) % 2 == 0 else get_tree(i).getpath(node); + xpath.path += '/@' + r; + xpath.sample = v; + xpaths.append(xpath); + break; + return is_child_contain_info; + + +def get_diff_nodes(tree, root, root_path, has_attr, exists=None): + xpaths = []; + nodes = [r for r in root.xpath(root_path)] + count = len(nodes); + if count > 1: + __get_diff_nodes(tree, nodes, xpaths, has_attr); + if exists is not None: + for r in exists: + for p in xpaths: + short_path = xpath_take_off(p.path, root_path); + if r.path == to_str(short_path): + p.name = r.name; + break; + return xpaths; + + +def __is_same_string(t1, t2): + if t1 is None and t2 is None: + return True; + elif t1 is not None and t2 is not None: + return t1.strip() == t2.strip(); + return False + + +def __search_node_name(node, xpaths): + if not hasattr(node, 'attrib'): + return 'col%s' % (len(xpaths)) + attr_key = ["class", "id"] + for key in attr_key: + name = node.attrib.get(key, None) + if name is not None: + break; + if name is None or name == '': + return 'col%s' % (len(xpaths)) + for c in xpaths: + if c.name == name: + name2 = node.getparent().attrib.get(name, None) + if name2 is None: + return 'col%s' % (len(xpaths)) + else: + name = name2 + '_' + name + return name.replace(' ', '_') + + +def search_properties(root, exist_xpaths=None, is_attr=False): + if exist_xpaths == None: exist_xpaths = []; + tree = etree.ElementTree(root) + exist_len = len(exist_xpaths) + if exist_len > 1: + root_path = get_common_xpath(exist_xpaths); + yield root_path, get_diff_nodes(tree, root, root_path, is_attr, exist_xpaths); + elif exist_len == 1: + real_path = exist_xpaths[0]; + path_dict = {}; + for r in xpath_iter_sub(real_path.path): + __search_table_root(tree, root.xpath(str(r)), path_dict, False, strict=False); + max_p = 0; + path = None; + for r in path_dict: + if path_dict[r] > max_p: + path = r; + max_p = path_dict[r]; + if path is not None: + items = get_diff_nodes(tree, root, path, is_attr, exist_xpaths); + if len(items) > 1: + yield path, items + else: + path_dict = {}; + __search_table_root(tree, [root], path_dict, True); + path_dict = sorted(path_dict, key=lambda d: path_dict[d], reverse=True); + for root_path in path_dict: + items = get_diff_nodes(tree, root, root_path, is_attr, exist_xpaths); + if len(items) > 1: + yield root_path, items; + yield None, None + + +def _get_etree(html): + root = None + if html != '': + try: + root 
= etree.HTML(html); + except Exception as e: + sys.stderr.write('html script error' + str(e)) + return root; + + +def get_list(html, xpaths=None, has_attr=False): + root = _get_etree(html); + if xpaths is None: + root_path, xpaths = search_properties(root, None, has_attr); + datas = get_datas(root, xpaths, True, None) + return datas, xpaths; + + +def get_main(html): + if is_str(html): + html = _get_etree(html); + tree = etree.ElementTree(html); + node = search_text_root(tree, html); + return node + + +def get_sub_xpath(root, xpath): + paths = xpath.split('/'); + path = '/' + '/'.join(paths[len(root.split('/')):len(paths)]); + return path; + + +def get_datas(root, xpaths, multi=True, root_path=None): + tree = etree.ElementTree(root); + docs = []; + if not multi: + doc = {}; + for r in xpaths: + data = get_xpath_data(tree, r.path, r.is_html); + if data is not None: + doc[r.name] = data; + else: + doc[r.name] = ""; + return doc; + else: + if is_none(root_path): + root_path2 = get_common_xpath(xpaths); + else: + root_path2 = root_path; + nodes = tree.xpath(root_path2) + if nodes is not None: + for node in nodes: + doc = {}; + for r in xpaths: + path = r.path; + if is_none(root_path): + paths = r.path.split('/'); + path = '/'.join(paths[len(root_path2.split('/')):len(paths)]); + else: + path = tree.getpath(node) + path; + if path == '': + path = '/' + data = get_xpath_data(node, path, r.is_html); + if data is not None: + doc[r.name] = data; + if len(doc) == 0: + continue; + docs.append(doc); + return docs; diff --git a/etlpy/tools.py b/etlpy/tools.py new file mode 100644 index 0000000..0788715 --- /dev/null +++ b/etlpy/tools.py @@ -0,0 +1,2027 @@ +# coding=utf-8 +import json +import os +import urllib + +from dill import dill +import codecs +CACHE_DB_NAME = '_etlpy_cache.db' +ALLOW_CACHE = True + +from etlpy.extends import * +from etlpy.params import request_param, ExpParam +from etlpy.pickledb import pickledb +from etlpy.spider import get_sub_xpath, get_diff_nodes, _get_etree, search_xpath, search_properties, get_datas, \ + get_node_html, get_main, get_node_text +from etlpy.multi_yielder import multi_yield, NORMAL_MODE, NETWORK_MODE, DEFAULT_WORKER_NUM + +if PY2: + pass +else: + import html + +MERGE_APPEND = '+' +MERGE_CROSS = '*' +MERGE_MERGE = '|' +MERGE_MIX = 'mix' + + +def __get_match_counts(mat): + return mat.lastindex if mat.lastindex is not None else 1 + + +class ETLTool(EObject): + ''' + base class for all tool + ''' + + def __init__(self): + super(ETLTool, self).__init__() + self.p = '' + + def process(self, data, env): + return data + + def init(self): + pass + + def get_p(self, data): + return query(data, self.p) + + def _is_mode(self, mode): + if not hasattr(self, 'mode'): + return False + return mode in self.mode.split('|') + + def _eval_script(self, p, global_para=None, local_para=None): + if p == '': + return True + if not is_str(p): + return p(self) + result = None + from datetime import datetime + from time import mktime + def get_time(mtime): + from pytz import utc + if mtime == '': + mtime = datetime.now() + ts = mktime(utc.localize(mtime).utctimetuple()) + return int(ts) + + try: + if global_para is not None: + result = eval(p, global_para, locals()) + else: + result = eval(p) + except Exception as e: + p_expt(e) + return result + + def __str__(self): + return get_type_name(self) + '\t' + to_str(self.p) + + +class Transformer(ETLTool): + ''' + base class for all transformer + ''' + + def __init__(self): + super(Transformer, self).__init__() + self.one_input = False + 
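        # Descriptive note on the two dispatch flags initialized here:
        # one_input=True means transform() is handed only the value of the
        # target column and its return value is written back (see _process
        # below); when False, transform() receives the whole row dict plus
        # the source and target column names. _m_process=True makes process()
        # pass the entire stream to m_process(), so a tool can batch or
        # reorder rows instead of mapping them one at a time.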
self._m_process = False + + def transform(self, data): + pass + + def m_process(self, data, column): + for r in data: + yield r + + def _process(self, data, column, transform_func): + def edit_data(col, n_col=None): + n_col = n_col if n_col != '' and n_col is not None else col + if col != '' and not has(data, col) and (not isinstance(self, (SetTF, MapTF))): + return data + if self.one_input: + try: + res = transform_func(get_value(data, col)) + return set_value(data, n_col, res) + except Exception as e: + p_expt(e) + else: + n_col = n_col if n_col != '' and n_col is not None else col + if col != '' and col not in data and isinstance(col, (SetTF,)): + return + try: + transform_func(data, col, n_col) + return data + except Exception as e: + p_expt(e) + + if is_str(column): + if column.find(u':') >= 0: + column = para_to_dict(column, ' ', ':') + elif column.find(' ') > 0: + column = [r.strip() for r in column.split(' ')] + if isinstance(column, dict): + for k, v in column.items(): + edit_data(k, v) + elif isinstance(column, (list, set)): + for k in column: + edit_data(k) + else: + return edit_data(column, None) + return data + + def process(self, generator, env): + column = env['column'] + if self._m_process == True: + for r in self.m_process(generator, column): + yield r + return + if generator is None: + return + for d in generator: + if d is None: + continue + if debug_level > 3: + logging.debug(d) + logging.debug('transformer:' + get_type_name(self)) + res = self._process(d, column, self.transform) + yield res + + +class Executor(ETLTool): + ''' + base class for all executor + ''' + + def __init__(self): + super(Executor, self).__init__() + self.debug = False + + def execute(self, data, column): + pass + + def process(self, data, env): + column = env['column'] + for r in data: + yield self.execute(r, column) + + def init(self): + pass + + +class Filter(ETLTool): + ''' + base class for all filter + ''' + + def __init__(self): + super(Filter, self).__init__() + self.reverse = False + self.stop_while = False + self.one_input = True + + def filter(self, data): + return True + + def process(self, data, env): + column = env['column'] + error = 0 + for r in data: + item = None + result = False + if self.one_input: + if column in r: + item = r[column] + result = self.filter(item) + if item is None and self.__class__ != NotNull: + continue + else: + item = r + result = self.filter(item, column) + + if result == True and self.reverse == False: + yield r + elif result == False and self.reverse == True: + yield r + else: + error += 1 + if self.stop_while == False: + continue + elif self.stop_while == True: + logging.info('stop iter \n') + break + elif isinstance(self.stop_while, int) and error >= self.stop_while: + logging.info('stop iter \n') + break + + +class Ascend(ETLTool): + ''' + ascend sorter + :param p: sort lambda function or str + ''' + + def __init__(self): + super(Ascend, self).__init__() + self._reverse = False + self.p = '_' + self.one_input = True + + def process(self, data, env): + column = env['column'] + all_data = [r for r in data] + p = 'value' if self.p == '' else self.p + + def sort_map(data): + import inspect + if column == '': + c = data + else: + c = data.get(column, None) + if is_str(p): + dic = merge({'_': c}, data) + result = self._eval_script(p, dic) + elif inspect.isfunction(p): + result = p(c) + return result + + all_data.sort(key=lambda x: sort_map(x), reverse=self._reverse) + for r in all_data: + yield r + + +class Descend(Ascend): + ''' + descend sorter + :param p: 
sort lambda function or str + ''' + + def __init__(self): + super(Descend, self).__init__() + self._reverse = True + + +class Generator(ETLTool): + def __init__(self): + super(Generator, self).__init__() + self.mode = MERGE_APPEND + self.pos = 0 + + def generate(self, data, env): + pass + + def process(self, generator, env): + column = env['column'] + if generator is None: + + return self.generate(None, env) + else: + p = self.mode + if p == MERGE_APPEND: + return append(generator, self.process(None, env)) + elif p == MERGE_MERGE: + return merge(generator, self.process(None, env)) + elif p == MERGE_CROSS: + return cross(generator, self.generate, env) + else: + return mix(generator, self.process(None, env)) + + +EXECUTE_INSERT = 'insert' +EXECUTE_SAVE = 'save' +EXECUTE_UPDATE = 'update' + + +class MongoDBConnector(EObject): + def __init__(self): + super(MongoDBConnector, self).__init__() + self.connect_str = '' + self.db = '' + + def init(self): + import pymongo + client = pymongo.MongoClient(self.connect_str) + self._db = client[self.db] + + +class DBBase(ETLTool): + def __init__(self): + super(DBBase, self).__init__() + self.table = '' + self.mode = EXECUTE_INSERT + + def get_table(self, data): + c = query(data, self.p) + t = query(data, self.table) + connector = self._proj.env[c] + connector.init() + table = connector._db[t] + return table + + +class DbEX(Executor, DBBase): + ''' + db writer and updater + :param p: env connector name + ''' + + def __init__(self): + super(DbEX, self).__init__() + + def init(self): + DBBase.init(self) + + def process(self, datas, env): + column = env['column'] + for data in datas: + table = self.get_table(data) + work = {EXECUTE_INSERT: lambda d: table.save(d), EXECUTE_UPDATE: lambda d: table.save(d)} + new_data = copy(data) + etype = query(data, self.mode) + work[etype](new_data) + yield data + + +class DbGE(Generator, DBBase): + ''' + db reader + :param p: env connector name + ''' + + def generate(self, data, env): + column = env['column'] + table = self.get_table(self.table) + for data in table.find(): + yield data + + +class JoinDBTF(Transformer, DBBase): + ''' + db join + :param p: env connector name + :param mapper: column value mapper + :param index: join index key + ''' + + def __init__(self): + super(JoinDBTF, self).__init__() + self.index = '' + self.mapper = '' + + def transform(self, data, col, ncol): + table = self.get_table(data) + if table is None: + buf = [] + else: + value = data[self.index] + mapper = query(data, self.mapper) + mapper = para_to_dict(mapper, ' ', ':') + + def db_filter(d): + if '_id' in d: + del d['_id'] + + keys = {r: 1 for r in mapper.keys()} + result = table.find({self.index: value}, keys) + buf = [] + for r in result: + db_filter(r) + r = conv_dict(r, mapper) + buf.append(r) + data[ncol] = buf + + +class MatchFT(Filter): + ''' + filter that match keyword or regex + :param p: keyword or regex + :param count: min match count + ''' + + def __init__(self): + super(MatchFT, self).__init__() + self.regex = '' + + def init(self): + if self.regex != '': + self.regex = re.compile(self.p) + self.count = 1 + + def filter(self, data): + p = self.get_p(data) + if self.regex == '': + return data.find(p) >= 0 + else: + v = self.regex.findall(data) + if v is None: + return False + return self.count <= len(v) + + +class NotIn(Filter): + def __init__(self): + super(NotIn, self).__init__() + self.one_input = False + + def filter(self, data, column): + item = data[column] + p = self.get_p(data) + if isinstance(p, (list, tuple)): + return 
item not in p + elif is_str(item): + sp = to_str(p).split(':') + if len(sp) == 1: + p = query(data, p) + return p != item + else: + min = float(query(data, sp[0])) + max = float(query(data, sp[1])) + return min <= item <= max + else: + return False + + +class RepeatFT(Filter): + ''' + filter that key column repeated + ''' + + def __init__(self): + super(RepeatFT, self).__init__() + + def init(self): + self._set = set() + + def filter(self, data): + if data in self._set: + return False + else: + self._set.add(data) + return True + + +class NotNull(Filter): + ''' + filter that key column is empty or None + ''' + + def filter(self, data): + if data is None: + return False + if is_str(data): + return data.strip() != '' + return True + + +# class DebugTF(Transformer): +# def __init__(self): +# super(DebugTF, self).__init__() +# def process(self, generator, env): +# env['execute']=not self.p +# return super(DebugTF,self).process(generator,env) + + +class SetTF(Transformer): + ''' + set value + :param p: target value + ''' + + def __init__(self): + super(SetTF, self).__init__() + + def transform(self, data, col, ncol): + p = self.get_p(data) + data[col] = p + + +class LetTF(Transformer): + ''' + make stream target on certain column. + :param p: column name + ''' + + def __init__(self): + super(LetTF, self).__init__() + self.regex = '' + + def m_process(self, generator, env): + return generator + + +class CountTF(Transformer): + def __init__(self): + super(CountTF, self).__init__() + self._m_process = True + + def m_process(self, generator, column): + count = 0 + for data in generator: + yield data + count += 1 + if count % self.p == 0: + print(count) + + +class IncrTF(Transformer): + ''' + add a auto increase value + :param p: useless + ''' + + def init(self): + super(IncrTF, self).__init__() + self._index = 0 + + def transform(self, data): + self._index += 1 + return self._index + + +class TagTF(Transformer): + ''' + add a auto increase value + :param p: useless + ''' + pass + + +class CopyTF(Transformer): + ''' + copy one or more columns to other columns, then target column will move, diff from cp2. 
(cp short for copy) + :param p: like 'a:b c:d' means copy a to b, and copy c to d + if have target column a, can short as ':b', equal as 'a:b' + ''' + + def __init__(self): + super(CopyTF, self).__init__() + + def transform(self, data, col, ncol): + v = get_value(data, col) + set_value(data, ncol, v) + + +class Copy2TF(CopyTF): + ''' + copy one or more columns to other columns, then target column will not move, diff from cp + :param p: like 'a:b c:d' means copy a to b, and copy c to d + if have target column a, can short as ':b', equal as 'a:b' + ''' + pass + + +class Copy3TF(CopyTF): + ''' + copy one or more columns to other columns, then target column will not move, diff from cp + :param p: like 'a:b c:d' means copy a to b, and copy c to d + if have target column a, can short as ':b', equal as 'a:b' + ''' + pass + + +class MoveTF(Transformer): + ''' + move one or more columns to other columns, mv short for move + :param p: like 'a:b c:d' means move a to b, and copy c to d + ''' + + def transform(self, data, col, ncol): + if col != ncol: + set_value(data, ncol, get_value(data, col)) + del_value(data, col) + + +class RemoveTF(Transformer): + ''' + delete certain columns, rm short for move + :param p: like 'a b c', will delete column a,b,c + ''' + + def __init__(self): + super(RemoveTF, self).__init__() + self.regex = '' + + def transform(self, data, col, ncol): + del_value(data, col) + + +class KeepTF(Transformer): + ''' + keep certain columns, delete all other columns + :param p: like 'a b c', will keep column a,b,c + ''' + + def __init__(self): + super(KeepTF, self).__init__() + self._m_process = True + self.regex = '' + + def m_process(self, datas, col): + if isinstance(col, dict): + for data in datas: + doc = {} + for k, v in data.items(): + if k in col: + doc[col[k]] = v + yield doc + else: + for data in datas: + doc = {} + for k, v in data.items(): + if k in col: + doc[k] = v + yield doc + + +class EscapeTF(Transformer): + ''' + escape string + :param p: , mode can be 'html','url' or 'text', default is 'url' + ''' + + def __init__(self): + super(EscapeTF, self).__init__() + self.one_input = True + self.p = 'html' + + def transform(self, data): + p = self.get_p(data) + if p == 'html': + if PY2: + import cgi + return cgi.escape(data) + else: + return html.escape(data) + elif p == 'url': + url = data.encode('utf-8') + return urllib.parse.quote(url) + return data + + +class CleanTF(Transformer): + ''' + un_escape or clean string + :param p: mode can be 'html','url' or 'text', default is 'html' + ''' + + def __init__(self): + super(CleanTF, self).__init__() + self.one_input = True + self.p = 'text' + + def transform(self, data): + p = self.get_p(data) + if p == 'text': + if PY2: + return data.encode('utf-8').decode('string_escape').decode('utf-8').replace('\'', '\"') + else: + return data.decode('unicode_escape') + elif p == 'html': + if PY2: + import HTMLParser + html_parser = HTMLParser.HTMLParser() + return html_parser.unescape(data) + else: + return html.unescape(data) + elif p == 'url': + return urllib.parse.unquote(data) + return data + + +class FormatTF(Transformer): + ''' + format string,like python format + :param p: like 'ab{0}c{column_name}def' ,'{0}' represent current column + ''' + + def __init__(self): + super(FormatTF, self).__init__() + self.p = '{0}' + self._re = re.compile('\{([\w_]+)\}') + + def transform(self, data, col, ncol=None): + input = self.p + if col not in [None, '']: + dic = merge({'_': data[col]}, data) + else: + dic = data + output = input.format(**dic) + 
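+        # editor's example (hypothetical values): with p='http://example.com/p{_}'
+        # and the current column holding 3, dic is {'_': 3, **row}, so output
+        # becomes 'http://example.com/p3'; other columns are addressable by name.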
+        data[ncol] = output
+
+
+class HtmlTF(Transformer):
+    '''
+    get html text from a node; if the value is not a node, it is left unchanged
+    '''
+
+    def __init__(self):
+        super(HtmlTF, self).__init__()
+        self.one_input = True
+
+    def transform(self, data):
+        from pyquery import PyQuery
+        if isinstance(data, PyQuery):
+            return data.html()
+        res = get_node_html(data)
+        return res
+
+
+class TextTF(Transformer):
+    '''
+    get plain text from a node; if the value is not a node, it is left unchanged
+    '''
+
+    def __init__(self):
+        super(TextTF, self).__init__()
+        self.one_input = True
+
+    def transform(self, data):
+        from pyquery import PyQuery
+        if isinstance(data, PyQuery):
+            return '\n'.join(get_node_text(r) for r in data)
+        res = get_node_text(data)
+        if res == '':
+            return to_str(data)
+        return res
+
+
+class RegexTF(Transformer):
+    '''
+    get regex match results from string into target array
+    '''
+
+    def __init__(self):
+        super(RegexTF, self).__init__()
+        self.one_input = True
+
+    def m_str(self, data):
+        if is_str(data):
+            return data
+        elif isinstance(data, (list, tuple)):
+            return ''.join(data)
+        return data
+
+    def _conv(self, x):
+        return x
+
+    def transform(self, data):
+        regex = re.compile(self.p)
+        items = re.findall(regex, to_str(data))
+        return [self._conv(self.m_str(r)) for r in items]
+
+
+class LastTF(Transformer):
+    '''
+    get last row from stream
+    '''
+
+    def __init__(self):
+        super(LastTF, self).__init__()
+        self.count = 1
+        self._m_process = True
+
+    def m_process(self, data, column):
+        r0 = None
+        while True:
+            try:
+                # fetch the next value
+                x = next(data)
+                r0 = x
+            except StopIteration:
+                # the stream is exhausted, stop looping
+                break
+        yield r0
+
+
+class AggTF(Transformer):
+    '''
+    aggregate current row with next row
+    :param p: aggregate function, can be lambda, function or string, parameter is a,b
+    '''
+
+    def __init__(self):
+        super(AggTF, self).__init__()
+        self._m_process = True
+
+    def m_process(self, data, column):
+        p = self.get_p(data)
+        r0 = None
+        import inspect
+        for m in data:
+            if column != '':
+                r = m[column]
+            else:
+                r = m
+            if r0 is None:
+                r0 = r
+                yield m
+                continue
+            if is_str(p):
+                r2 = self._eval_script(p, global_para={'a': r0, 'b': r})
+            elif inspect.isfunction(p):
+                r2 = p(r0, r)
+            if r2 is not None:
+                r = r2
+            if column != '':
+                m[column] = r
+            else:
+                m = r
+            yield m
+            r0 = r
+
+
+class ReplaceTF(RegexTF):
+    '''
+    replace string
+    :param p: match str or regex
+    :param mode: 'str' or 're'
+    :param value: new value
+    '''
+
+    def __init__(self):
+        super(ReplaceTF, self).__init__()
+        self.value = ''
+        self.mode = 'str'
+        self.one_input = False
+
+    def transform(self, data, col, ncol):
+        ndata = data[col]
+        if ndata is None:
+            return
+        new_value = query(data, self.value)
+        p = self.get_p(data)
+        if self.mode == 're':
+            result = re.sub(p, new_value, ndata)
+        else:
+            result = ndata.replace(p, new_value)
+        data[ncol] = result
+
+
+class NumTF(RegexTF):
+    '''
+    get number from str
+    '''
+
+    def __init__(self):
+        super(NumTF, self).__init__()
+        self.p = '(-?\\d+)(\\.\\d+)?'
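+    # editor's note (a sketch of the inherited flow, not new behaviour): the
+    # pattern above has two groups, so RegexTF's re.findall yields tuples such
+    # as ('12', '.5'); m_str joins them to '12.5' and _conv below parses the
+    # string, trying int first and falling back to float.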
+ + def _conv(self, x): + try: + return int(x) + except Exception as e: + return float(x) + + +class SplitTF(Transformer): + ''' + split value with certain chars + :param p: 'a b': split string with a,b, return str array + ''' + + def __init__(self): + super(SplitTF, self).__init__() + self.one_input = True + + def transform(self, data): + return re.split(self.p, data) + + +class IntoTF(Transformer): + def transform(self, data, col, ncol): + v = data[col] + p = self.get_p(data) + items = replace_paras(split(p, ' '), col) + if isinstance(v, list): + for i in range(min(len(v), len(items))): + data[items[i]] = v[i] + + +class StripTF(Transformer): + ''' + strip string with certain char + :param p: char + ''' + + def __init__(self): + super(StripTF, self).__init__() + self.one_input = True + + def transform(self, data): + if data is None: + return None + p = self.get_p(data) + if p == '': + return data.strip() + else: + return data.strip(p) + + +class ExtractTF(Transformer): + ''' + get substring starts with 'start' and ends with 'end' from string + :param p: start string + :param end: end string + :param has_margin: bool, if contain start and end + ''' + + def __init__(self): + super(ExtractTF, self).__init__() + self.has_margin = False + self.one_input = True + self.end = '' + + def transform(self, data): + start = data.find(self.p) + if start == -1: + return + end = data.find(self.end, start) + if end == -1: + return + if self.has_margin: + end += len(self.end) + if not self.has_margin: + start += len(self.p) + return data[start:end] + + +class MapTF(Transformer): + def __init__(self): + super(MapTF, self).__init__() + self.p = 'script' + + def _get_data(self, data, col): + p = self.get_p(data) + if col == '': + if is_str(p): + dic = merge({'_': data}, data) + self._eval_script(p, dic) + else: + p(data) + return None + else: + + value = data[col] + if is_str(p): + dic = merge({'_': value}, data) + result = self._eval_script(p, dic) + else: + result = p(value) + return result + + def transform(self, data, col, ncol): + js = self._get_data(data, col) + if ncol != '': + data[ncol] = js + + +class Create(Generator): + def __init__(self): + super(Create, self).__init__() + self.p = 1 + + def can_dump(self): + return is_str(self.p) + + def generate(self, data, env): + column = env['column'] + from pandas import DataFrame + p = self.get_p(data) + import inspect + import copy + if is_str(p): + if p == '': + result = [{}] + else: + result = self._eval_script(p) + elif isinstance(p, int): + result = ({} for i in range(p)) + elif inspect.isfunction(p): + result = p() + + elif isinstance(p, DataFrame): + result = (row.to_dict() for l, row in p.iterrows()) + else: + result = p + for r in result: + if column != '': + yield {column: r} + else: + yield copy.copy(r) + + +class Where(Filter): + def __init__(self): + super(Where, self).__init__() + self.p = 'True' + self.one_input = False + + def can_dump(self): + return is_str(self.p) + + def filter(self, data, column): + p = self.get_p(data) + import inspect + data = copy(data) + if column == '': + value = data + else: + value = data[column] if column in data else '' + if is_str(p): + dic = merge({'_': data}, data) + result = self._eval_script(p, dic) + elif inspect.isfunction(p): + result = p(value) + if result == None: + return False + return result + + +class DetectTF(Transformer): + def __init__(self): + super(DetectTF, self).__init__() + self.attr = False + self.index = 0 + self._m_process = True + + def m_process(self, datas, column): + from lxml 
import etree
+        is_first = True
+        for data in datas:
+            p = self.get_p(data)
+            root = data[column]
+            if root is None:
+                return
+            if is_str(root):
+                root = _get_etree(root)
+            tree = etree.ElementTree(root)
+            if p != '':
+                xpaths = get_diff_nodes(tree, root, self.p, self.attr)
+                root_path = p
+            else:
+                result = first_or_default(get_mount(search_properties(root, None, self.attr), take=1, skip=self.index))
+
+                if result is None:
+                    print('smart detection failed')
+                    return
+                root_path, xpaths = result
+            for r in xpaths:
+                r.path = get_sub_xpath(root_path, r.path)
+
+            code0 = '\n.'.join((u"cp('{ncol}:{col}').xpath('/{path}')[0].text()\\".format(col=r.name, ncol=column,
+                                                                                          path=r.path, sample=r.sample)
+                                for r in xpaths))
+            if is_first:
+                code = u".xpath('%s').list().html().tree()\\\n.%s" % (root_path, code0)
+                print(code)
+                code2 = '\n'.join(
+                    u"#{key} : #{value}".format(key=r.name, value=r.sample.strip() if r.sample is not None else '') for
+                    r in xpaths)
+                print(code2)
+                is_first = False
+            result = get_datas(root, xpaths, True, root_path=root_path)
+            for r in result:
+                yield r
+
+
+class CacheTF(Transformer):
+    def __init__(self):
+        super(CacheTF, self).__init__()
+        self._m_process = True
+
+    def m_process(self, datas, column):
+        i = 0
+        cache = self.p
+        if cache is None:
+            for data in datas:
+                yield data
+            return
+        while i < len(cache):
+            yield cache[i]
+            i += 1
+        del cache[:]
+        for r in datas:
+            cache.append(r)
+            yield r
+
+
+class WebTF(Transformer):
+    def __init__(self):
+        super(WebTF, self).__init__()
+        self.encoding = 'utf-8'
+        self._mode = 'get'
+        self.header = ''
+
+    def transform(self, data, col, ncol):
+        import requests
+        from etlpy.spider import get_encoding
+        if self.p in [None, '']:
+            req = request_param.copy()
+            key = 'data' if self._mode == 'post' else 'url'
+            req[key] = ExpParam('[_]')
+        else:
+            req = self.p
+
+        result = None
+        real_req = req.get(data, col)
+        if 'url' in real_req:
+            url = str(real_req['url'])
+            if not url.startswith('http'):
+                url = 'http://' + url
+            real_req['url'] = url
+        cache = self._proj.cache
+        if cache is not None:
+            key = real_req.get('url', '') + real_req.get('data', '')
+            result = cache.get(key, None)
+        if result is None:
+            try:
+                if self._mode == 'get':
+                    r = requests.get(**real_req)
+                else:
+                    r = requests.post(**real_req)
+                r.raise_for_status()  # raise if the response status code is not 200
+                if r is not None and r.status_code == 200:
+                    result = get_encoding(r.content)
+                    if cache is not None and cache.size() < 1000:
+                        cache.set(key, result)
+            except requests.RequestException as e:
+                p_expt(e)
+
+        data[ncol] = result
+
+
+class GetTF(WebTF):
+    def __init__(self):
+        super(GetTF, self).__init__()
+        self._mode = 'get'
+
+
+class PostTF(WebTF):
+    def __init__(self):
+        super(PostTF, self).__init__()
+        self._mode = 'post'
+
+
+class TreeTF(Transformer):
+    def __init__(self):
+        super(TreeTF, self).__init__()
+        self.one_input = True
+        self.smart = False
+
+    def transform(self, data):
+        from lxml.etree import iselement
+        if not iselement(data):
+            root = _get_etree(data)
+        else:
+            root = data
+        if self.smart:
+            return get_main(root)
+        return root
+
+
+class SearchTF(Transformer):
+    def __init__(self):
+        super(SearchTF, self).__init__()
+        self.one_input = True
+        self.mode = 'str'
+
+    def transform(self, data):
+        if data is None:
+            return None
+        if is_str(data):
+            from lxml import etree
+            tree = _get_etree(data)
+            result = search_xpath(tree, self.p, self.mode, True)
+            print(result)
+            return result
+
+
+class ListTF(Transformer):
+    def __init__(self):
+        super(ListTF, self).__init__()
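+        # editor's note: ListTF expands a column that holds a list into one row
+        # per element; m_process below merges the parent row's columns given by
+        # p (or every other key when p is '*') into each child row.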
+        self._m_process = True
+
+    def m_process(self, generator, col):
+        for data in generator:
+            if data is None or col not in data:
+                if debug_level > 0:
+                    logging.warning('data empty')
+                continue
+            root = data[col]
+            if self.p == '*':
+                p = list(data.keys())
+                p.remove(col)
+            else:
+                p = self.get_p(data)
+            for r in root:
+                r2 = {col: r}
+                my = merge_query(r2, data, p)
+                yield my
+
+
+class XPathTF(Transformer):
+    def _trans(self, data):
+        from lxml import etree
+        root = None
+        if is_str(data):
+            root = _get_etree(data)
+            tree = etree.ElementTree(root)
+        else:
+            tree = data
+        return tree, root
+
+    def transform(self, data, col, ncol):
+        target = data[col]
+        tree, root = self._trans(target)
+        if tree is None:
+            return
+        node_path = query(data, self.p)
+        if node_path is None or node_path == '':
+            nodes = [target]
+        else:
+            nodes = tree.xpath(self.p)
+        if nodes is None:
+            nodes = [target]
+        data[ncol] = nodes
+
+
+class StopTF(Transformer):
+    def __init__(self):
+        super(StopTF, self).__init__()
+
+
+class PyQTF(XPathTF):
+    def __init__(self):
+        super(PyQTF, self).__init__()
+
+    def transform(self, data, col, ncol):
+        from pyquery import PyQuery as pyq
+        root = pyq(data[col])
+        if root is None:
+            return
+        node_path = self.get_p(data)
+        if node_path == '' or node_path is None:
+            return
+        nodes = root(node_path)
+        data[ncol] = nodes if nodes is not None else []
+
+
+class AtTF(Transformer):
+    def __init__(self):
+        super(AtTF, self).__init__()
+        self.one_input = True
+
+    def transform(self, data):
+        p = self.get_p(data)
+        if isinstance(self.p, slice):
+            return data[p]
+        elif isinstance(data, (list, tuple)):
+            p = get_num(p, 0)
+            if len(data) <= p:
+                return None
+            return data[p]
+        elif data is None:
+            return None
+        else:
+            return data.get(p, None)
+
+
+class ParallelTF(Transformer):
+    def __init__(self):
+        super(ParallelTF, self).__init__()
+        self.p = NORMAL_MODE
+        self.worker_num = 1
+
+
+class DumpTF(Transformer):
+    def __init__(self):
+        super(DumpTF, self).__init__()
+        self.one_input = True
+        self.p = 'json'
+
+    def transform(self, data):
+        p = self.get_p(data)
+        if p == 'json':
+            return json.dumps(data)
+        elif p == 'yaml':
+            import yaml
+            return yaml.dump(data)
+        elif p == 'html':
+            return get_node_html(data)
+        else:
+            return to_str(data)
+
+
+class LoadTF(Transformer):
+    def __init__(self):
+        super(LoadTF, self).__init__()
+        self.one_input = True
+        self.p = 'json'
+
+    def transform(self, data):
+        p = self.p
+        if p == 'json':
+            return json.loads(data)
+        elif p == 'yaml':
+            import yaml
+            return yaml.load(data)
+        elif p == 'html':
+            return _get_etree(data)
+
+
+class Range(Generator):
+    def __init__(self):
+        super(Range, self).__init__()
+        self.p = ':'
+
+    def generate(self, data, env):
+        column = env['column']
+        p = self.get_p(data)
+        start = 0
+        end = 0
+        interval = 1
+        if is_str(p):
+            start, end, interval = get_range(p, data)
+        elif isinstance(p, int):
+            end = p
+        elif isinstance(p, (tuple, list)):
+            l = len(p)
+            if l > 0:
+                start = p[0]
+            if l > 1:
+                end = p[1]
+            if l > 2:
+                interval = p[2]
+        if start == end:
+            yield {column: start}
+
+            return
+        values = []
+        try:
+            values = range(start, end, interval)
+        except Exception as e:
+            p_expt(e)
+        for i in values:
+            item = {column: i}
+            yield item
+
+
+class SubBase(ETLTool):
+    def __init__(self):
+        super(SubBase, self).__init__()
+        self.range = '0:100'
+
+    def _get_task(self, data):
+        p = self.get_p(data)
+        if isinstance(p, ETLTask):
+            return p
+        if p not in self._proj.env:
+            sys.stderr.write('sub task %s not in current project' % p)
+        sub_etl =
self._proj.env[p] + return sub_etl + + def _get_tools(self, data): + sub_etl = self._get_task(data) + start, end, interval = get_range(self.range) + tools = tools_filter(sub_etl.tools[start:end:interval], excluded=self, mode=NORMAL_MODE) + return tools + + def _generate(self, data, env): + doc = copy(data) + generator = [doc] if doc is not None else None + for r in ex_generate(self._get_tools(doc), generator, env): + yield r + + +class SubGE(Generator, SubBase): + def __init__(self): + super(SubGE, self).__init__() + + def generate(self, data, env): + return self._generate(data, env) + + +class SubEx(Executor, SubBase): + def __init__(self): + super(SubEx, self).__init__() + + def process(self, data, env): + for d in data: + if self.debug == True: + for r in self._generate(d, env): + yield r + else: + yield self.execute(d, env) + + def execute(self, data, env): + count = 0 + try: + for r in self._generate(data, env): + count += 1 + except Exception as e: + p_expt(e) + print('subtask:' + to_str(count)) + return data + + +class SubTF(Transformer, SubBase): + def __init__(self): + super(SubTF, self).__init__() + + def transform(self, data, col, ncol): + env = {'column': col} + data[ncol] = (r for r in self._generate(data, env)) + + +class RotateTF(Transformer): + ''' + rotate matrix, not lazy + ''' + + def __init__(self): + super(RotateTF, self).__init__() + self._m_process = True + + def m_process(self, datas, column): + result = {} + for data in datas: + p = self.get_p(data) + key = data.get(column, None) + if key is None: + continue + value = query(data, p) + result[key] = value + yield result + + +class ToDictTF(Transformer): + ''' + take certain columns merge into dict + :param p: columns, split by blanket + ''' + + def __init__(self): + super(ToDictTF, self).__init__() + self._m_process = True + + def m_process(self, datas, col): + for data in datas: + doc = {} + merge_query(doc, data, self.p) + data[col] = doc + yield data + + +class DrillTF(Transformer): + ''' + take dict value out + :param p: dict columns, split by blanket. 
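+    e.g. (hypothetical values): {'a': 1, 'info': {'b': 2}} drilled on column 'info' yields {'a': 1, 'b': 2};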
all column will be added if value is '' + ''' + + def __init__(self): + super(DrillTF, self).__init__() + self._m_process = True + + def m_process(self, datas, col): + for data in datas: + doc = copy(data) + col_data = data[col] + if self.p != '': + merge_query(doc, col_data, self.p) + else: + merge(doc, col_data) + yield doc + + +class TakeTF(Transformer): + ''' + take top n rows + :param p: n + ''' + + def __init__(self): + super(TakeTF, self).__init__() + + def process(self, data, env): + column = env['column'] + p = get_num(self.get_p(data)) + for r in get_mount(data, p): + yield r + + +class SkipTF(Transformer): + ''' + skip n rows + :param p: n + ''' + + def __init__(self): + super(SkipTF, self).__init__() + + def process(self, data, column): + p = get_num(self.get_p(data)) + for r in get_mount(data, None, p): + yield r + + +class DelayTF(Transformer): + ''' + delay some time + :param p: delay n millisecond + ''' + + def __init__(self): + super(DelayTF, self).__init__() + self.p = 100 + self._m_process = True + + def m_process(self, datas, col): + import time + for data in datas: + p = get_num(self.get_p(data), default=0) + time.sleep(float(p) / 1000.0) + yield data + + +class Read(Generator): + pass + + +class Download(Executor): + ''' + download a file from web + :param p: target url + :param path: save path on disk + ''' + + def __init__(self): + super(Download, self).__init__() + self.path = '' + + def execute(self, data, column): + import requests + p = self.get_p(data) + (folder, file) = os.path.split(p) + if not os.path.exists(folder): + os.makedirs(folder) + url = data[column] + target = open(p, 'wb') + req = requests.get(url) + if req is None: + return + target.write(req.content) + target.close() + return data + + +class Project(EObject): + ''' + project that contains all tasks + ''' + + def __init__(self): + self.env = {} + self.desc = "edit project description here" + if ALLOW_CACHE: + self.cache = pickledb(CACHE_DB_NAME, False) + else: + self.cache = None + + def clear(self): + ''' + clear all tasks in project + :return: self + ''' + if self.cache is not None: + self.cache.deldb() + self.env.clear() + return self + + def dumps_json(self): + dic = convert_dict(self) + return json.dumps(dic, ensure_ascii=False, indent=2) + + def dumps_yaml(self): + import yaml + dic = convert_dict(self) + return yaml.dump(dic) + + def dump_yaml(self, path): + with open(path, 'w', encoding='utf-8') as f: + f.write(self.dumps_yaml()) + + def load_yaml(self, path): + import yaml + with open(path, 'r', encoding='utf-8') as f: + d = yaml.load(f) + return self.load_dict(d) + + def loads_json(self, js): + d = json.loads(js) + return self.load_dict(d) + + def dump_json(self, path): + with open(path, 'w', encoding='utf-8') as f: + f.write(self.dumps_json()) + + def load_json(self, path): + with open(path, 'r', encoding='utf-8') as f: + js = f.read() + return self.loads_json(js) + + def load_dict(self, dic): + items = dic.get('env', {}) + for key, item in items.items(): + if 'Type' in item: + item['Type'] = item['Type'].replace('tools.', '') + obj_type = item['Type'].replace('tools.', '') + task = eval('%s()' % obj_type) + if obj_type == 'ETLTask': + for r in item['tools']: + r['Type'] = r['Type'].replace('tools.', '') + etl = eval('%s()' % r['Type']) + for attr, value in r.items(): + if attr in ['Type']: + continue + setattr(etl, attr, value) + etl._proj = self + task.tools.append(etl) + else: + dict_copy_poco(task, item) + else: + task = item + self.env[key] = task + return self + + +def 
ex_generate(tools, generator=None, env=None):
+    if env is None:
+        env = {'column': '', 'execute': False}
+    start = 0
+    while True:
+        take_index = get_index(tools[start:], lambda x: isinstance(x, TakeTF))
+        if take_index == -1:
+            break
+        else:
+            generator = _ex_generate(tools[start:start + take_index], generator, env)
+            start += take_index + 1
+    pos = 0 if start == 0 else start - 1
+    generator = _ex_generate(tools[pos:], generator, env)
+    for item in generator:
+        yield item
+
+
+def _ex_generate(tools, generator=None, env=None):
+    mapper, reducer, pl = parallel_map(tools, env)
+    if pl is None:
+        group, mode, worker_num = 1, None, 1
+    else:
+        group, mode, worker_num = 1, pl.p, pl.worker_num
+    # TODO: handle grouping (group is currently fixed to 1)
+    generator = generate(mapper, generator, env)
+    if reducer is None:
+        return generator
+    # group_generator = group_by_mount(generator, group)
+    generator = multi_yield(lambda task: _ex_generate(reducer, task, env), mode, worker_num, generator)
+    return generator
+
+
+def tools_filter(tools, init=True, excluded=None, mode=NORMAL_MODE):
+    stop_index = get_index(tools, lambda x: isinstance(x, StopTF))
+    if stop_index != -1:
+        tools = tools[:stop_index]
+
+    buf = []
+    if not isinstance(mode, (list, tuple)):
+        mode = [i for i in range(mode, -1, -1)]
+    else:
+        mode.sort(reverse=False)
+        if mode[-1] != NORMAL_MODE:
+            mode.append(NORMAL_MODE)
+    index = 0
+    for tool in tools:
+        if excluded == tool:
+            continue
+        if init:
+            tool.init()
+        buf.append(tool)
+
+    buf2 = []
+    total = len(buf)
+    # if a ParallelTF requests a higher level than the global mode, downgrade it;
+    # if it requests a lower level, that lower level wins; levels only decrease
+    # further down the pipeline, no two stages share a level, and 0 is the floor
+
+    for i in range(total):
+        tool = buf[i]
+        allow_insert = False
+        buf2.append(tool)
+        if isinstance(tool, ListTF):
+            j = i + 1
+            while j < total and not isinstance(buf[j], ParallelTF):
+                if isinstance(buf[j], ListTF):
+                    allow_insert = True
+                    break
+                j += 1
+            pl = None
+            if j == total:
+                pass
+            elif allow_insert:
+                pl = ParallelTF()
+                buf2.append(pl)
+            elif isinstance(buf[j], ParallelTF):
+                pl = buf[j]
+            if pl is not None:
+                pl.mode = mode[index]
+                pl.worker_num = DEFAULT_WORKER_NUM if pl.p < NETWORK_MODE else 1
+                if index < len(mode) - 1:
+                    index += 1
+    return buf2
+
+
+def get_column(tool, env):
+    next_column = old_column = column = env['column']
+    p = tool.p
+    if is_str(p):
+        if p.find(u':') >= 0:
+            p = para_to_dict(p, ' ', ':')
+
+        elif p.find(' ') > 0:
+            p = [r.strip() for r in p.split(' ')]
+    p = replace_paras(p, old_column)
+    if isinstance(tool, Copy3TF):
+        if env.get('keep', None) is None:
+            env['keep'] = old_column
+            column = {column: p}
+        else:
+            column = {env['keep']: p}
+        next_column = p
+
+    elif isinstance(tool, (LetTF, RemoveTF, KeepTF)):
+        if tool.regex != '':
+            pass
+        column = p
+        env['keep'] = None
+    elif isinstance(tool, (CopyTF, MoveTF, Copy2TF)):
+        column = p
+        if isinstance(p, dict):
+            if not isinstance(tool, Copy2TF):
+                p2 = p.values()
+            else:
+                p2 = p.keys()
+
+            next_column = ' '.join(p2)
+        env['keep'] = None
+
+    env['next'] = next_column
+    env['column'] = column
+    return env
+
+
+def generate(tools, generator=None, env=None):
+    '''
+    evaluate a tool stream
+    :param tools: [ETLTool]
+    :param generator: seed generator for the stream
+    :param env: runtime environment (current column, execute flag)
+    :return: a generator
+    '''
+    if tools is not None:
+        for tool in tools:
+            env = get_column(tool, env)
+            if isinstance(tool, LetTF):
+                continue
+            execute = env.get('execute', False)
+            if not execute and isinstance(tool, Executor):
+                continue
+            generator = tool.process(generator, env.copy())
+            env['column'] = env['next']
+    if generator is
None: + return [] + return generator + + +def parallel_map(tools, env, certain_mode=None): + ''' + split tool into mapper and reducer + :param tools: a list for tool + :return: mapper, reducer and parallel parameter + ''' + + index = get_index(tools, lambda x: isinstance(x, ParallelTF) and (certain_mode is None or x.p == certain_mode)) + if index == -1: + return tools, None, None + + mapper = tools[:index] + reducer = tools[index + 1:] + parameter = tools[index] + return mapper, reducer, parameter + + +class ETLTask(EObject): + def __init__(self): + self.tools = [] + self._master = None + + def __len__(self): + return len(self.tools) + + def __tool_factory(self, tool_type, item): + tool = tool_type() + tool._proj = self._proj + tool.p = item + self.tools.append(tool) + return tool + + def __getattr__(self, item): + if item in self.__dict__: + return self.__dict__[item] + if item == '_': + item = '' + tool = self.__tool_factory(LetTF, item) + return self + + def __add__(self, task): + if isinstance(task, ETLTask): + for item in task.tools: + item._proj = self._proj + self.tools.append(item) + # TODO: other kind of op? + return self + + def __mul__(self, task): + tool = self.__tool_factory(SubGE, task) + tool.mode = MERGE_CROSS + return self + + def __or__(self, task): + tool = self.__tool_factory(SubGE, task) + tool.mode = MERGE_MERGE + return self + + def __getitem__(self, item): + self.__tool_factory(AtTF, item) + return self + + def to_json(self): + dic = convert_dict(self) + return json.dumps(dic, ensure_ascii=False, indent=2) + + def to_yaml(self): + import yaml + dic = convert_dict(self) + return yaml.dump(dic) + + def eval(self, script=''): + eval('self.' + script) + return self + + def check(self): + tools = to_list(tools_filter(self.tools)) + for i in range(1, len(tools)): + attr = EObject() + tool = tools[i] + title = get_type_name(tool).replace('etl.', '') + ' ' + tool.column + list_datas = to_list(progress_indicator(get_keys(ex_generate(tools[:i]), attr), title=title)) + keys = ','.join(attr.__dict__.keys()) + print('%s, %s, %s' % (str(i), title, keys)) + + def distribute(self, port=None, monitor_connector_name=None, table_name=None): + from etlpy.concurrence import Master + self._master = Master( + self._master.start_project(self._proj, self.name, port, monitor_connector_name, table_name)) + + def _get_related_tasks(self, tasks): + for r in self.tools: + if isinstance(r, SubBase) and r not in tasks: + tasks.add(r.name) + r._get_related_tasks(tasks) + + def get_related_tasks(self): + tasks = set() + self._get_related_tasks(tasks) + return tasks + + def pl_generator(self): + new_proj = dill.dumps(self._proj) + new_proj= codecs.encode(new_proj,'base64').decode() + # for k, v in new_proj.items(): + # if isinstance(v, dict) and v.get('Type', None) == 'ETLTask' and v['name'] not in related_tasks: + # del new_proj[k] + + def generator(): + env = {'column': '', 'execute': False} + mapper, reducer, parallel = parallel_map(self.tools, NETWORK_MODE) + if parallel is None: + print('this script do not support pl...') + return + for jobs in group_by_mount(generate(mapper, None, env), parallel.worker_num): + yield jobs, env + + id = 0 + for jobs, env in progress_indicator(generator()): + job = {'proj': new_proj, 'name': self.name, 'tasks': jobs, 'id': id, 'env': env} + yield job + id += 1 + + def rpc(self, method='finished', server='127.0.0.1', port=60007): + import requests + if method in ['finished', 'dispatched', 'clean']: + url = "http://%s:%s/task/query/%s" % (server, port, method) + 
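+            # editor's note (assumed server API): the job server is expected to
+            # expose GET /task/query/<method> for status queries and POST
+            # /task/insert for pushing jobs; both endpoints exchange JSON.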
data = requests.get(url).json() + print('remain: %s' % (data['remain'])) + #return collect(data[method], count=1000000) + elif method == 'insert': + url = "http://%s:%s/task/%s" % (server, port, method) + id = 0 + count=0 + for job in self.pl_generator(): + res = requests.post(url, json=job) + js= res.json() + print('task insert %s, %s' % (id,js)) + if js['status']=='success': + count+=1 + id += 1 + print('total push tasks: %s' % (count)) + + def stop_server(self): + if self._master is None: + return 'server is not exist' + self._master.manager.shutdown() + + def __str__(self): + def conv_value(value): + if is_str(value): + value = value.replace('\n', ' ').replace('\r', '') + sp = "'" + if value.find("'") >= 0: + if value.find('"') >= 0: + sp = "'''" + else: + sp = '"' + return "%s%s%s" % (sp, value, sp) + return value + + array = [] + array.append('##task name:%s' % self.name) + array.append('.clear()') + for t in self.tools: + typename = get_type_name(t) + s = ".%s(%s" % (typename, conv_value(get_value(t, 'column'))) + attrs = [] + default_dict = type(t)().__dict__ + for att in t.__dict__: + value = t.__dict__[att] + if att in ['one_input']: + continue + if not isinstance(value, (int, bool, float)) and not is_str(value): + continue + if value is None or att not in default_dict or default_dict[att] == value: + continue + attrs.append(',%s=%s' % (att.lower(), conv_value(value))) + if any(attrs): + s += ''.join(attrs) + s += ')\\' + array.append(s) + return '\n'.join(array) + + def init(self): + for tool in self.tools: + tool.init() + return self + + def todf(self): + from pandas import DataFrame + list_datas = to_list(self.query()) + return DataFrame(list_datas) + + def query(self, take=999, skip=0, mode=NORMAL_MODE): + tools = tools_filter(self.init().tools[skip:take], init=False, mode=mode) + for r in ex_generate(tools): + yield r + cache = self._proj.cache + if cache is not None: + cache.dump() + + def __iter__(self): + for r in self.query(): + yield r + + def __call__(self, count=None, format=''): + return self.collect(count=count, format=format) + + def debug(self, p): + pass + + def collect(self, format='', count=None, paras=None): + datas = get_mount(self.init().query(), take=count) + return collect(datas, format, paras=paras) diff --git a/extends.py b/extends.py deleted file mode 100644 index 45f5a6a..0000000 --- a/extends.py +++ /dev/null @@ -1,128 +0,0 @@ -# encoding: UTF-8 -import re; - -spacere = re.compile("[ ]{2,}"); -spacern = re.compile("(^\r\n?)|(\r\n?$)") - - -def getkeys(generator): - count=0; - s=set(); - for r in generator: - s=s|r.keys(); - count+=1; - if count>=20: - return list(s); - return list(s) - -def ReplaceLongSpace(txt): - r = spacere.subn(' ', txt)[0] - r = spacern.subn('', r)[0] - return r; - - -def Merge(d1, d2): - for r in d2: - d1[r] = d2[r]; - return d1; - - -def MergeQuery(d1, d2, columns): - if isinstance(columns, str) and columns.strip() != "": - columns = columns.split(' '); - for r in columns: - if r in d2: - d1[r] = d2[r]; - return d1; - - - - -def Query(data, key): - if data is None: - return key; - if isinstance(key, str) and key.startswith('[') and key.endswith(']'): - key = key[1:-1]; - return data[key]; - return key; - - - - - -def findany(iteral, func): - for r in iteral: - if func(r): - return True; - return False; - - -def getindex(iteral, func): - for r in range(len(iteral)): - if func(iteral[r]): - return r; - return -1; - -def Cross(a, genefunc): - - for r1 in a: - for r2 in genefunc(r1): - for key in r2: - r1[key] = r2[key] - yield r1; 
- - -def MergeAll(a, b): - while True: - t1 = a.__next__() - if t1 is None: - return; - t2 = b.__next__() - if t2 is not None: - for t in t2: - t1[t] = t2[t]; - yield t1; - - -def Append(a, b): - for r in a: - yield r; - for r in b: - yield r; - -def get_type_name(obj): - s=str(obj.__class__); - p=s.find('.'); - r= s[p+1:].split('\'')[0] - return r; - - -class EObject(object): - pass; - - - -def convert_to_builtin_type(obj): - d= { key:value for key,value in obj.__dict__.items() if isinstance(value,(str,int,float,list,dict,tuple,EObject) or value is None)}; - return d - -def dict_to_poco_type(obj): - if isinstance(obj,dict): - result= EObject(); - for key in obj: - v= obj[key] - setattr(result,key,dict_to_poco_type(v)) - return result - elif isinstance(obj,list): - for i in range(len(obj)): - obj[i]=dict_to_poco_type(obj[i]); - - return obj; - - -def dict_copy_poco(obj,dic): - for key,value in obj.__dict__.items(): - if key in dic: - if isinstance(dic[key], (str,int,float)): - - setattr(obj,key,dic[key]) diff --git a/project.xml b/project.xml deleted file mode 100644 index b693c76..0000000 --- a/project.xml +++ /dev/null @@ -1,105 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/sample.py b/sample.py deleted file mode 100644 index a887c57..0000000 --- a/sample.py +++ /dev/null @@ -1,20 +0,0 @@ -import etl; - -import extends -import time; -path='/home/desert.zym/dev' - -proj=etl.Project_LoadXml(path+'/Hawk-Projects/图片抓取/昵图网.xml'); -lagou=proj.modules['昵图网']; -tools= lagou.AllETLTools; -tools[-12].Format="/cloud/usr/desert.zym/picture/昵图网/{1}/{0}.jpg"; -tools[-1].Enabled=False; -tools[-9].Enabled=False; -#for r in lagou.QueryDatas(etlCount=19,execute=False): -# print(r) -# print(r) -from distributed import * -master =Master(proj,"昵图网"); -master.start(); - - diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5fd5d21 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +"""Setup file for aqara package.""" +from setuptools import setup, find_packages + +setup(name='etlpy', + version='1.0', + description='Super ETL tool', + url='https://github.com/ferventdesert/etlpy', + author='Yiming Zhao', + author_email= 'buptzym@qq.com', + license='MIT', + packages=['etlpy'], + keywords = ['spider', 'stream', 'dsl'], + install_requires=['lxml', 'pyquery','requests'] +) \ No newline at end of file diff --git a/spider.py b/spider.py deleted file mode 100644 index b850318..0000000 --- a/spider.py +++ /dev/null @@ -1,334 +0,0 @@ -# coding=utf-8 -import gzip -import re -import socket -import urllib.request -from lxml import etree -from urllib.parse import urlparse,urlunparse -import extends; -import http.cookiejar -from urllib.request import quote - -boxRegex = re.compile(r"\[\d{1,3}\]"); - - -class CrawItem(extends.EObject): - def __init__(self, name=None, sample=None, ismust=False, isHTMLorText=True, xpath=None): - self.XPath = xpath; - self.Sample = sample; - self.Name = name; - self.IsMust = ismust; - self.IsHTMLorText = isHTMLorText; - self.Children = []; - - def __str__(self): - return "%s %s %s" % (self.Name, self.XPath, self.Sample); - - -def RemoveFinalNum(paths): - v = paths[-1]; - m = boxRegex.search(v); - if m is None: - return paths; - s = m.group(0); - paths[-1] = v.replace(s, ""); - return paths; - - -def GetMaxCompareXPath(items): - xpaths = [r.XPath.split('/') for r in items]; - 
minlen = min(len(r) for r in xpaths); - c = None; - for i in range(minlen): - for index in range(len(xpaths)): - path = xpaths[index]; - if index == 0: - c = path[i]; - elif c != path[i]: - first = path[0:i + 1]; - return '/'.join(RemoveFinalNum(first)); - - -attrsplit=re.compile('@|\['); - -def GetDataFromXPath(node, path): - p = node.xpath(path); - if p is None: - return None; - if len(p) == 0: - return None; - paths = path.split('/'); - last = paths[-1]; - if last.find('@')>=0 and last.find('[1]')>=0: - return p[0]; - return getnodetext(p[0]); - - - - - - - - -def GetImage(addr, fname): - u = urllib.urlopen(addr) - data = u.read() - f = open(fname, 'wb') - f.write(data) - f.close() - - -def urlEncodeNonAscii(b): - return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) - -def iriToUri(iri): - parts= urlparse(iri) - - pp= [(parti,part) for parti, part in enumerate(parts)] - res=[]; - for p in pp: - res.append(p[1] if p[0] != 4 else quote(p[1] )) - - return urlunparse(res); - - - - -extract = re.compile('\[(\w+)\]'); - -charset = re.compile(r'content="text/html;.?charset=(.*?)"'); -class HTTPItem(extends.EObject): - def __init__(self): - self.Url = '' - self.Cookie = ''; - self.Headers = None; - self.Timeout = 30; - self.opener = ""; - self.postdata='' - - def PraseURL(self, url): - u = Para2Dict(urlparse(self.Url).query, '&', '='); - for r in extract.findall(url): - url = url.replace('[' + r + ']', u[r]) - return url; - - def GetHTML(self, destUrl=None): - if destUrl is None: - destUrl = self.Url; - destUrl = self.PraseURL(destUrl); - socket.setdefaulttimeout(self.Timeout); - cj = http.cookiejar.CookieJar() - pro = urllib.request.HTTPCookieProcessor(cj) - opener = urllib.request.build_opener(pro) - t = [(r, self.Headers[r]) for r in self.Headers]; - opener.addheaders = t; - binary_data = self.postdata.encode('utf-8') - try: - destUrl.encode('ascii') - except UnicodeEncodeError: - destUrl = iriToUri(destUrl) - - try: - if self.postdata=='': - page=opener.open(destUrl); - else: - page = opener.open(destUrl, binary_data) - html = page.read() - except Exception as e: - print(e); - return "" - - - if page.info().get('Content-Encoding') == 'gzip': - html = gzip.decompress(html) - encoding = charset.search(str(html)) - if encoding is not None: - encoding = encoding.group(1); - if encoding is None: - encoding = 'utf-8' - try: - html=html.decode(encoding) - except UnicodeDecodeError as e: - print(e); - import chardet - encoding= chardet.detect(html) - html=html.decode(encoding); - - return html; - - -# 解压函数 -def ungzip(data): - data = gzip.decompress(data) - return data; - -def IsNone(data): - return data is None or data==''; - -def __getnodetext__(node, arrs): - t=node.text; - if t is not None: - s = t.strip(); - if s != '': - arrs.append(s) - for sub in node.iterchildren(): - __getnodetext__(sub,arrs) - -def getnodetext(node): - if node is None: - return "" - arrs=[]; - __getnodetext__(node,arrs); - return ' '.join(arrs); - - -class SmartCrawler(extends.EObject): - def __init__(self): - self.IsMultiData = "List"; - self.HttpItem = None; - self.Name = None; - self.CrawItems = None; - self.Login = ""; - self.haslogin = False; - self.RootXPath='' - - def autologin(self, loginItem): - if loginItem.postdata is None: - return; - import http.cookiejar - cj = http.cookiejar.CookieJar() - pro = urllib.request.HTTPCookieProcessor(cj) - opener = urllib.request.build_opener(pro) - t = [(r, loginItem.Headers[r]) for r in loginItem.Headers]; - opener.addheaders = t; - binary_data = 
loginItem.postdata.encode('utf-8') - op = opener.open(loginItem.Url, binary_data) - data = op.read().decode('utf-8') - print(data) - self.HttpItem.Url = op.url; - return opener; - - def CrawData(self, url): - - if self.Login !="" and self.haslogin == False: - self.HttpItem.opener = self.autologin(self.Login); - self.haslogin = True; - html = self.HttpItem.GetHTML(url); - - root =None if html=='' else etree.HTML(html); - if root is None: - return {} if self.IsMultiData == 'One' else []; - - tree = etree.ElementTree(root); - if isinstance(self.CrawItems, list) and len(self.CrawItems) == 0: - return {'Content': html}; - - return self.GetDataFromCrawItems(tree ); - - def GetDataFromCrawItems(self,tree): - documents = []; - if self.IsMultiData =='One': - document = {}; - for r in self.CrawItems: - data = GetDataFromXPath(tree, r.XPath); - if data is not None: - document[r.Name] = data; - else: - document[r.Name] = ""; - return document; - else: - if not IsNone(self.RootXPath): - rootXPath = self.RootXPath; - else: - rootXPath = GetMaxCompareXPath(self.CrawItems); - nodes = tree.xpath(rootXPath) - if nodes is not None: - for node in nodes: - document = {}; - for r in self.CrawItems: - path=r.XPath; - if IsNone(self.RootXPath): - paths=r.XPath.split('/'); - path='/'.join(paths[len(rootXPath.split('/')):len(paths)]); - else: - path= tree.getpath(node)+ path; - data = GetDataFromXPath(node,path); - if data is not None: - document[r.Name] = data; - if len(document) == 0: - continue; - documents.append(document); - return documents; - -def Para2Dict(para, split1, split2): - r = {}; - for s in para.split(split1): - rs = s.split(split2); - if len(rs) < 2: - continue; - key = rs[0]; - value = s[len(key) + 1:]; - r[rs[0]] = value; - - return r; - - -def GetHTML(url, code=None): - url = url.strip(); - if not url.startswith('http'): - url = 'http://' + url; - print("auto transform %s" % (url)); - socket.setdefaulttimeout(30) - i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5", - "Accept": "text/plain"} - req = urllib.request.Request(url=url, headers=i_headers) - page = urllib.request.urlopen(req) - html = page.read() - return html; - - -def GetHTMLFromFile(fname): - f = open(fname, 'r', 'utf-8'); - r = f.read(); - return r; - - -def GetCrawNode(craws, name, tree): - for r in craws: - if r.Name == name: - return tree.xpath(r.XPath); - return None; - - -def GetImageFormat(name): - if name is None: - return None, None; - p = name.split('.'); - if len(p) != 2: - return name, 'jpg'; - - back = p[-1]; - if back == "jpg" or back == "png" or back == "gif": # back=="png" ignore because png is so big! 
- return p[-2], back; - return None, None; - - -def GetCrawData(crawitems, tree): - doc = {}; - for crawItem in crawitems: - node = tree.xpath(crawItem.XPath); - if len(node) == 0: - if crawItem.IsMust: - return; - if crawItem.IsHTMLorText is False: - text = node[0].text; - else: - text = etree.tostring(node[0]); - doc[crawItem.Name] = text; - return doc; - - -def GetHtmlTree(html): - root = etree.HTML(html); - tree = etree.ElementTree(root); - return tree; diff --git a/test/car.py b/test/car.py new file mode 100644 index 0000000..beb377d --- /dev/null +++ b/test/car.py @@ -0,0 +1,64 @@ +# coding=utf-8 +import os +import sys + + +parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, parentdir) + +from etlpy.etlpy import * +import pandas as pd +from etlpy.multi_yielder import THREAD_MODE + +df = pd.read_excel('/Users/zhaoyiming/Documents/datasets/汽车/汽车大系.xlsx') +df=df.drop_duplicates(['id']) + +url = 'http://www.autohome.com.cn/' +url2 = 'http://reply.autohome.com.cn/api/comments/show.json?count=50&page={p}&id={pid}&appid=1&datatype=json' +url3 = 'https://club.m.autohome.com.cn/bbs/forum-c-{id}-{_}.html' + +t = task().create(df).keep('id 中文名') \ + + +news = \ +task().cp('id:page').format(url + '{_}/0/0-0-1-0/').get().xpath('//*[@id="maindiv"]/div[2]/div/div[2]/div/a[last()-1]')[ + 0].text() \ + .num()[0].let('p').range('1:[page]', mode='*').map(lambda x: x + 1).format(url + '{id}/0/0-0-{p}-0/').get() \ + .pyq('.cont-info > ul > li').list('中文名 id').html().tree() \ + .cp('p:标题').xpath('//h3/a')[0].text() \ + .cp('p:作者').xpath('//p[1]/span[1]/a')[0].text() \ + .cp('p:日期').xpath('//p[1]/span[2]')[0].text() \ + .cp('p:阅读').xpath('//p[1]/span[3]')[0].text() \ + .cp('p:评论').xpath('//p[1]/span[4]')[0].text() \ + .cp('p:标签').xpath('//p[3]/a')[0].text() \ + .cp('p:url').xpath('//h3/a/@href')[0].rm('p') + +news_content = task().cp('url:pid').re('\d{6,}')[-1].url.map(lambda x: 'http:' + x).replace('(\d{1,2})-all)?.html', + '.article-content').text() + +news_comment = task().let('p').set(1).cp('pid:page').format(url2).get().load('json')['commentcountall'].map( + lambda x: int(x / 50) + 1).let('p').range('1:[page]', mode='*') \ + .map(lambda x: x + 1).format(url2).get().load('json')['commentlist'].list().drill().rm('p') + + +forum = task().rm('中文名').let('page').set('1').format(url3).get().xpath('//*[@id="pagination"]/span[1]/input/@value')[0].text().num()[0]\ + .pl().let('p').range('1:[page]',mode='*').format(url3).get() \ + .pyq('.dataitem').list().html().tree() \ + .cp('p:标题').xpath('//a/div[1]/h4')[0].text() \ + .cp('p:信息').xpath('//@lang')[0].text() \ + .rm('p id').count(1000) + +t+= forum +with open('car_detail.txt','w',encoding='utf-8') as f: + for r in t.query(mode=[THREAD_MODE]): + if r['标题'] is not None: + r['标题']=r['标题'].replace('\t',' ') + f.write('%s\t%s\n'%(r['信息'],r['标题'])) + + + + + + 
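+# editor's sketch (hypothetical, same pipeline): instead of streaming rows to a
+# text file, the task can be materialised into pandas via todf(), which wraps
+# query() in a DataFrame:
+# result_df = t.todf()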
+"""https://clubajax.autohome.com.cn/topic/rv?ids=68636168%2C68483133%2C68337683%2C68538636%2C68769979%2C68107494%2C68727736%2C68662797%2C68641093%2C68542968%2C68526238%2C68335740%2C68144650%2C67887433%2C63603597%2C61320362%2C64789017%2C61320052%2C68788727%2C68690688%2C68793958%2C68734539%2C68108882%2C66747057%2C68740022%2C61420614%2C68396378%2C68793765%2C68399126%2C66709631%2C67093707%2C67668632%2C66511580%2C68519018%2C68699937%2C68706016%2C68324712%2C68539772%2C68725096%2C68652907%2C68788998%2C68154712%2C68770010%2C68783882%2C68784381%2C68752386%2C68788776%2C68793267%2C68763771%2C67551583%2C68782222%2C68792798%2C68791810%2C68165203%2C68792061%2C68788867%2C68785752%2C68792239%2C68722930%2C68783776%2C68718609%2C68790735%2C68775439%2C68719829%2C64485489%2C68780459%2C68780533%2C68771320%2C68791102%2C68670320%2C68791082%2C62935564%2C68703688%2C67832705%2C68755372%2C68637196%2C68751759%2C68318281%2C68749733%2C68769108%2C68757290%2C66806202%2C68781958%2C68790045%2C66281281%2C68697656%2C68784487%2C68464857%2C68787867%2C68789735%2C68788752%2C64795404%2C68776675%2C68217534%2C68785868%2C68422331%2C68784001%2C65515744%2C68742603%2C68786494%2C66011183%2C68768396%2C68771370%2C68752519%2C68774346%2C68784505%2C68782260%2C68721234%2C68774469%2C68694268%2C68766355%2C68208651%2C68784014%2C68767245%2C68787972%2C68765607%2C68759041%2C68787761%2C +""" \ No newline at end of file diff --git a/test/dianping.py b/test/dianping.py new file mode 100644 index 0000000..fc2f483 --- /dev/null +++ b/test/dianping.py @@ -0,0 +1,63 @@ +# coding=utf-8 +import os +import sys +import random +parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, parentdir) + + +from etlpy.extends import para_to_dict +from etlpy.multi_yielder import PROCESS_MODE,NORMAL_MODE, THREAD_MODE +from etlpy.proxy import get_proxy_all + + + +from etlpy.etlpy import * +from etlpy.params import request_param, Param + + + + + + + +url = 'https://www.dianping.com' +proxy =get_proxy_all() + + + + +cookie = '''Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8 +Accept-Encoding:gzip, deflate +Accept-Language:zh-CN,zh;q=0.8,en;q=0.6 +Cache-Control:max-age=0 +Connection:keep-alive +Cookie:_hc.v=80a0369c-047c-ebd8-8b24-6b7203fff952.1483109843; __utma=1.669418081.1483109843.1488640538.1492677693.3; __utmz=1.1488640538.2.2.utmcsr=dianping.com|utmccn=(referral)|utmcmd=referral|utmcct=/search/keyword/2/0_%E5%9B%BD%E8%B4%B8; _lxsdk_cuid=15e5ba830aec8-08a516b6c56a2f-31627c01-13c680-15e5ba830aec8; _lxsdk=15e5ba830aec8-08a516b6c56a2f-31627c01-13c680-15e5ba830aec8; PHOENIX_ID=0a010725-15e65d95a1b-ef61ca9; cityid=3; share_ab=shop%3AA%3A3%7Cshopreviewlist%3AA%3A1%7Cmap%3AA%3A1; __mta=156143876.1504776309640.1507538981893.1507539061294.9; _lx_utm=utm_source%3Ddianping.com%26utm_medium%3Dreferral%26utm_content%3D%252Fsearch%252Fkeyword%252F2%252F0_%25E5%259B%25BD%25E8%25B4%25B8; s_ViewType=10; _lx_utm=utm_source%3Ddianping.com%26utm_medium%3Dreferral%26utm_content%3D%252Fsearch%252Fkeyword%252F2%252F0_%25E5%259B%25BD%25E8%25B4%25B8; JSESSIONID=BA60D084A3735A6D0F6A0C6B6A2C5AD4; aburl=1; cy=3; cye=hangzhou; _lxsdk_s=2e1185acd01e200ce0fad7a516ea%7C%7C4 +Host:www.dianping.com +Upgrade-Insecure-Requests:1 +User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36''' +headers = para_to_dict(cookie, '\n', ':') +r = request_param +r = r.merge('headers', headers)#.merge('proxies',Param({'http': lambda x:"http://{}".format(proxy) 
})) +t = task().create().url.set(url + '/search/category/3/75/g2878').get(r).pyq('.nc-contain')[0].pyq( + 'a').list().html().pl().cp('url:a').split('"')[1].get(r).pyq( + '#shop-all-list > ul > li').list().html().pl().tree() \ + .cp3('名称').xpath('//div[2]/div[1]/a[1]/h4')[0].text()\ + .cp3('点评').xpath('//div[2]/div[2]/a[1]/b')[0].text() \ + .cp3('平均').xpath('//div[2]/div[2]/a[2]/b')[0].text().num()[0] \ + .cp3('类型').xpath('//div[2]/div[3]/a[1]/span')[0].text() \ + .cp3('位置').xpath('//div[2]/div[3]/a[2]/span')[0].text() \ + .cp3('地址').xpath('//div[2]/div[3]/span')[0].text() \ + .cp3('效果').xpath('//div[2]/span/span[1]/b')[0].text() \ + .cp3('师资').xpath('//div[2]/span/span[2]/b')[0].text() \ + .cp3('环境').xpath('//div[2]/span/span[3]/b')[0].text() \ + .cp3('id').xpath('//@href')[0].cp('id:id0').split('/')[2].cp('id:html').let('html').get(r) \ + .cp3('phone').pyq('.phone')[0].text().let('html').cp('_:详细'). \ + pyq(' .con li').text().rm('a id0 html').let('id 点评').num()[0]. \ + mv('点评:点评数').phone.split(' ') + + +#print(t.rpc('clean',port=6067)) +#t.rpc('insert',port=6067) +for r in t.take(20).query(mode=[PROCESS_MODE]): + print(r.keys())
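+# editor's sketch (hypothetical host/port): the commented rpc calls above can be
+# used to distribute this task; 'insert' POSTs the jobs built by pl_generator()
+# to a job server, and 'finished' prints the remaining count:
+# t.rpc('insert', server='127.0.0.1', port=6067)
+# t.rpc('finished', server='127.0.0.1', port=6067)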