diff --git a/README.md b/README.md index e3cc213..67930fe 100644 --- a/README.md +++ b/README.md @@ -7,26 +7,26 @@ etlpy是基于配置文件的数据采集和清洗工具。 写爬虫和数据清洗代码总是很烦人。因此,应该通过工具生成爬虫和数据清洗的代码! etlpy就是为了解决这个问题而生的。 -通过可视化和图形化设计工具,快速生成爬虫和数据清洗流程,并保存为xml文件,并由etlpy引擎解析它,即可获得最终的数据结果。 +通过可视化和图形化设计工具(Hawk),快速生成爬虫和数据清洗流程,并保存为xml文件,并由etlpy引擎解析它,即可获得最终的数据结果。 ##2.使用 -使用起来非常简单: +使用起来非常简单执行main.py文件): ``` -from etl import ETLTool -tool = ETLTool(); -tool.LoadProject('project.xml', '数据清洗ETL-大众点评'); -datas = tool.RefreshDatas(); -for r in datas: - print(r) +from classInit import projectLoad +from classInit.projectExecutor import projExecute + +path = 'xmlFile' +project = projectLoad.Project_LoadXml(path + '/demo.xml') +print(project.modules) +proj = projExecute(project) +t = proj.projectFunction() + ``` -RefreshDatas函数返回的是生成器,通过for循环,即可自动读取所有数据。 ##3.基本原理 模块分为 生成,过滤,排序,转换,执行四种。 利用Python的生成器,可以将不同模块组织起来,定义一个流水线,数据(python的字典)会在流水线上被加工和消费。 -图形化工具是用C#开发的,使用了类似Python生成器的Linq技术。其原始思路来自于Lisp的s-表达式。 - ##4. 用途 爬虫,计算,清洗,任何符合一定计算范式的数据,都可以使用它来完成。 diff --git a/classInit/ETLTask.py b/classInit/ETLTask.py new file mode 100644 index 0000000..70dd9ca --- /dev/null +++ b/classInit/ETLTask.py @@ -0,0 +1,29 @@ +# -*- encoding: utf-8 -*- +""" +@File : ETLTask.py +@Time : 19/8/2019 08:57 +@Author : liyang + +数据清洗模块 +""" + + +class ETLTask(): + '''SmartETLTool(数据清洗)的子任务 + + ''' + + def __init__(self): + self.AllETLTools = [] + + +class ETLTool(): + def __init__(self): + self.Enabled = True + self.Column = '' + + def process(self, data): + return data + + def init(self): + pass diff --git a/classInit/ETLTool/Executor.py b/classInit/ETLTool/Executor.py new file mode 100644 index 0000000..bab0c39 --- /dev/null +++ b/classInit/ETLTool/Executor.py @@ -0,0 +1,91 @@ +# -*- encoding: utf-8 -*- +""" +@File : Executor.py +@Time : 19/8/2019 08:59 +@Author : liyang + +执行器 +""" +import sys + +sys.path.append('../') +from classInit.ETLTask import ETLTool + + +class Executor(ETLTool): + '''ETLTool的执行类组(在xml文件中为Group="Executor") + + ''' + + def execute(self, data): + pass + + def process(self, data): + for r in data: + self.execute(r) + yield r + + +def create(item): + ''' + 类实列化 + :param item: 待实例化的类名 + :return: 实例化后的类(对象) + ''' + return eval('%s()' % item) + + +class EtlEX(Executor): + pass + # def execute(self, datas): + # subetl = self.__proj__.modules[self.ETLSelector] + # for data in datas: + # if spider.IsNone(self.NewColumn): + # doc = data.copy() + # else: + # doc = {} + # extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column) + # result = (r for r in generate(subetl.AllETLTools, [doc])) + # count = 0 + # for r in result: + # count += 1 + # print(r) + # print(count) + # yield data + + +class TableEX(Executor): + '''写入数据表,将数据保存为EXCEL + + ''' + pass + # def __init__(self): + # super(TableEX, self).__init__() + # self.Table = 'Table' + # + # def execute(self, data): + # tables = self.__proj__.tables + # tname = self.Table + # if tname not in tables: + # tables[tname] = [] + # for r in data: + # tables[tname].append(r) + # yield r + + +class SaveFileEX(Executor): + def __init__(self): + super(SaveFileEX, self).__init__() + self.SavePath = '' + + def execute(self, data): + pass + # save_path = extends.Query(data, self.SavePath) + # (folder, file) = os.path.split(save_path) + # if not os.path.exists(folder): + # os.makedirs(folder) + # urllib.request.urlretrieve(data[self.Column], save_path) + + +class DbEX(Executor): + pass diff --git a/classInit/ETLTool/Filter.py b/classInit/ETLTool/Filter.py new file mode 100644 index 0000000..fb75bbf --- /dev/null +++ b/classInit/ETLTool/Filter.py @@ -0,0 +1,97 @@ +# -*- encoding: utf-8 -*- +""" +@File : Filter.py +@Time : 19/8/2019 08:59 +@Author : liyang +""" +import sys + +sys.path.append('../') +from classInit.ETLTask import ETLTool +import re + + +class Filter(ETLTool): + '''ETLTool的过滤类组(在xml文件中为Group="Filter") + + ''' + + def __init__(self): + super(Filter, self).__init__() + self.Revert = False + + def filter(self, data): + + return True + + def process(self, data): + for r in data: + item = None + if self.Column in r: + item = r[self.Column] + if item is None and self.__class__ != NullFT: + continue + result = self.filter(item) + if result == True and self.Revert == False: + yield r + elif result == False and self.Revert == True: + yield r + + +def create(item): + ''' + 类实列化 + :param item: 待实例化的类名 + :return: 实例化后的类(对象) + ''' + return eval('%s()' % item) + + +class RegexFT(Filter): + + def init(self): + self.Regex = re.compile(self.Script) + self.Count = 1 + + def filter(self, data): + v = self.Regex.findall(data) + if v is None: + return False + else: + return self.Count <= len(v) + + +class RangeFT(Filter): + + def filter(self, item): + f = float(item) + return self.Min <= f <= self.Max + + +class RepeatFT(Filter): + + def init(self): + self.set = set() + + def filter(self, data): + if data in self.set: + return False + else: + self.set.add(data) + return True + + +class NullFT(Filter): + '''空对象过滤器 + + ''' + def filter(self, data): + if data is None: + return False + if isinstance(data, str): + return data.strip() != '' + return True + + +class NumRangeFT(Filter): + pass diff --git a/classInit/ETLTool/Generator.py b/classInit/ETLTool/Generator.py new file mode 100644 index 0000000..538aad8 --- /dev/null +++ b/classInit/ETLTool/Generator.py @@ -0,0 +1,122 @@ +# -*- encoding: utf-8 -*- +""" +@File : Generator.py +@Time : 19/8/2019 08:55 +@Author : liyang + +生成器 +""" + +import sys + +sys.path.append('../') +from classInit.ETLTask import ETLTool + + +class Generator(ETLTool): + '''ETLTool的生成类组(在xml文件中为"Group="Generator") + + 生成数据清洗的工具。eg.ETLTool类型为RangeGE + ''' + + def __init__(self): + # 继承父类的初始化 + super(Generator, self).__init__() + # 初始化合并类型为append + self.MergeType = 'Append' + # 初始化位置为0 + self.Position = 0 + + def generate(self, generator): + pass + + def process(self, generator): + pass + # if generator is None: + # return self.generate(None) + # else: + # if self.MergeType == 'Append': + # return extends.Append(generator, self.process(None)) + # elif self.MergeType == 'Merge': + # return extends.Merge(generator, self.process(None)) + # else: + # return extends.Cross(generator, self.generate) + + +def create(item): + ''' + 类实列化 + :param item: 待实例化的类名 + :return: 实例化后的类(对象) + ''' + return eval('%s()' % item) + + +class RangeGE(Generator): + '''数据清洗任务中数据清洗工具(ETLTool)类型(Type)为RangeGE的处理。 + + 继承于生成类,生成区间数 + ''' + + def __init__(self): + super(RangeGE, self).__init__() + # 初始化设置间隔数 + self.Interval = '1' + # 初始化设置数的最大值 + self.MaxValue = '1' + # 初始化最小值 + self.MinValue = '1' + + def generate(self): + items = [] + # 生成由最小值到最大值,间隔为Interval的int序列 + interval = int(self.Interval) + maxvalue = int(self.MaxValue) + minvalue = int(self.MinValue) + # 包括最大值 + for i in range(minvalue, maxvalue + 1, interval): + item = {self.Column: round(i, 5)} + items.append(item) + return items + # yield item + + +class EtlGE(Generator): + '''子任务生成 + + ''' + # def generate(self, data): + # subetl = self.__proj__.modules[self.ETLSelector] + # for r in generate(subetl.AllETLTools): + # yield r + pass + + +class TextGE(Generator): + '''从文本生成。 + + 直接导入url,若导入url必须有'https://'或'http://' + ''' + + def __init__(self): + super(TextGE, self).__init__() + self.Content = '' + def generate(self): + result = [] + self.arglists = [r.strip() for r in self.Content.split('\n')] + for i in range(self.Position, len(self.arglists)): + result.append({self.Column: self.arglists[i]}) + return result + # yield + + +class BfsGE(Generator): + pass + + +class FolderGE(Generator): + pass + + +class TableGE(Generator): + pass diff --git a/classInit/ETLTool/Transformer.py b/classInit/ETLTool/Transformer.py new file mode 100644 index 0000000..a21dcd9 --- /dev/null +++ b/classInit/ETLTool/Transformer.py @@ -0,0 +1,525 @@ +# -*- encoding: utf-8 -*- +""" +@File : Transformer.py +@Time : 19/8/2019 08:58 +@Author : liyang + +转换器 +""" +import sys + +sys.path.append('../') +from classInit.ETLTask import ETLTool +from classInit import spider +import os +import re +import json + + +class Transformer(ETLTool): + '''ETLTool的转换类组(在xml文件中为 Group="Transformer") + + ''' + + def __init__(self): + # 继承父类的初始化 + super(Transformer, self).__init__() + # 初始化是否为多重返回数据为False + self.IsMultiYield = False + self.NewColumn = '' + self.OneOutput = True + self.OneInput = False + + def transform(self, data, project): + pass + + def process(self, data, project): + pass + # if self.IsMultiYield: # one to many + # for r in data: + # for p in self.transform(r): + # yield extends.MergeQuery(p, r, self.NewColumn) + # return + # for d in data: # one to one + # if self.OneOutput: + # if self.Column not in d or self.Column not in d: + # yield d + # continue + # item = d[self.Column] if self.OneInput else d + # res = self.transform(item) + # key = self.NewColumn if self.NewColumn != '' else self.Column + # d[key] = res + # else: + # self.transform(d) + # yield d + + +def create(item): + ''' + 类实列化 + :param item: 待实例化的类名 + :return: 实例化后的类(对象) + ''' + + return eval('%s()' % item) + + +class AddNewTF(Transformer): + + def transform(self, data, project): + return self.NewValue + + +class AutoIndexTF(Transformer): + def init(self): + super(AutoIndexTF, self).__init__() + self.currindex = 0 + + def transform(self, data, project): + self.currindex += 1 + return self.currindex + + +class RenameTF(Transformer): + '''列名修改器 + + ''' + + def __init__(self): + super(RenameTF, self).__init__() + self.OneOutput = False + + def transform(self, data, project): + if not self.Column in data: + return + item = data[self.Column] + del data[self.Column] + if self.NewColumn != "": + data[self.NewColumn] = item + + +class DeleteTF(Transformer): + '''删除该列 + + ''' + + def __init__(self): + super(DeleteTF, self).__init__() + self.OneOutput = False + + def transform(self, data, project): + # 遍历data,data为字典 + for key, value in data.items(): + # 遍历data内的数据 + for i in range(len(value)): + if self.Column in value[i]: + del value[i][self.Column] + + +class HtmlTF(Transformer): + '''html字符转义 + + ''' + pass + # def __init__(self): + # super(HtmlTF, self).__init__() + # self.OneInput = True + # + # def transform(self, data, project): + # return html.escape(data) if self.ConvertType == 'Encode' else html.unescape(data) + + +class UrlTF(Transformer): + pass + # def __init__(self): + # super(UrlTF, self).__init__() + # self.OneInput = True + # + # def transform(self, data, project): + # if self.ConvertType == 'Encode': + # url = data.encode('utf-8') + # return urllib.parse.quote(url) + # else: + # return urllib.parse.unquote(data) + + +class RegexSplitTF(Transformer): + def transform(self, data, project): + items = re.split(self.Regex, data) + if len(items) <= self.Index: + return data + if not self.FromBack: + return items[self.Index] + else: + index = len(items) - self.Index - 1 + if index < 0: + return data + else: + return items[index] + return items[index] + + +class SplitPageTF(Transformer): + '''对输入列进行分页 + + ''' + pass + + +class MergeTF(Transformer): + def __init__(self): + super(MergeTF, self).__init__() + self.Format = '{0}' + self.MergeWith = '' + + def transform(self, data, project): + if self.MergeWith == '': + columns = [] + else: + columns = [str(data[r]) for r in self.MergeWith.split(' ')] + # columns.insert(0, data[self.Column] if self.Column in data else ''); + result = [] + columns = data[self.Column] + res = self.Format + for i in columns: + result.append({self.NewColumn: res.format(str(i[self.Column]))}) + return result + + +class RegexTF(Transformer): + '''正则转换器 + + ''' + + def __init__(self): + super(RegexTF, self).__init__() + self.Script = '' + self.OneInput = True + + # def init(self): + self.Regex = re.compile(self.Script) + + def transform(self, data, project): + item = re.findall(self.Regex, str(data)) + if self.Index < 0: + return '' + if len(item) <= self.Index: + return '' + else: + r = item[self.Index] + return r if isinstance(r, str) else r[0] + + +class ReReplaceTF(RegexTF): + + def transform(self, data, project): + return re.sub(self.Regex, self.ReplaceText, data) + + +class NumberTF(RegexTF): + '''提取数字 + + ''' + + def __init__(self): + super(NumberTF, self).__init__() + self.Script = '' # TODO + + def transform(self, data, project): + t = super(NumberTF, self).transform(data, project) + if t is not None and t != '': + return int(t) + return t + + +class SplitTF(Transformer): + '''字符串分割 + + ''' + + def __init__(self): + super(SplitTF, self).__init__() + self.SplitChar = '' + self.OneInput = True + + def transform(self, data, project): + splits = self.SplitChar.split(' ') + sp = splits[0] + if sp == '': + return data + + r = data.split(splits[0]) + if len(r) > self.Index: + return r[self.Index] + return '' + + +class TrimTF(Transformer): + def __init__(self): + super(TrimTF, self).__init__() + self.OneInput = True + + def transform(self, data, project): + return data.strip() + + +class StrExtractTF(Transformer): + def __init__(self): + super(StrExtractTF, self).__init__() + self.HaveStartEnd = False + self.Start = '' + self.OneInput = True + self.End = '' + + def transform(self, data, project): + start = data.find(self.Former) + if start == -1: + return + end = data.find(self.End, start) + if end == -1: + return + if self.HaveStartEnd: + end += len(self.End) + if not self.HaveStartEnd: + start += len(self.Former) + return data[start:end] + + +class PythonTF(Transformer): + '''python转换器 + + ''' + + def __init__(self): + super(PythonTF, self).__init__() + self.OneOutput = False + self.Script = 'value' + self.ScriptWorkMode = '不进行转换' + + def transform(self, data, project): + # for d in data: + # for i in d + start = data[self.Column] + # 语句self.Script的最后一个参数,用作运算的参数 + end_index = re.findall('[a-zA-Z0-9]+', self.Script)[-1] + end = data[end_index] + result = [] + print(start) + for i in range(len(start)): + result.append(eval(self.Script, start[i], end[i])) + if result is not None and self.IsMultiYield == False: + key = self.NewColumn if self.NewColumn != '' else self.Column + data[key] = result + return result + + +class CrawlerTF(Transformer): + ''' + 从爬虫转化 + ''' + + def __init__(self): + super(CrawlerTF, self).__init__() + self.CrawlerSelector = '' + self.MaxTryCount = 1 + self.IsRegex = False + self.OneOutput = False + + def init(self, project): + ''' + 根据工程初始化爬虫 + :param project: 项目工程目录 + :return: + ''' + self.IsMultiYield = True + self.crawler = project.modules.get(self.CrawlerSelector, None) + self.buff = {} + + def transform(self, data, project): + self.init(project) + crawler = self.crawler + # print(data) + headers = spider.setHttpItem(crawler.HttpItem) + # for d in data: + # # 爬虫返回的数据 + # html_data = spider.getURLdata(d, headers) + # # 根据xpath规则处理html, + # data = spider.processHtml(html_data, crawler.RootXPath, crawler.CrawItems) + # print(len(data)) + result = [] + for d in data[self.Column]: + html_data = spider.getURLdata(d[self.Column], headers) + result.append(spider.processHtml(html_data, crawler.RootXPath, crawler.CrawItems)) + return result + + +class XPathTF(Transformer): + '''xpath转换器 + + ''' + + def __init__(self): + super(XPathTF, self).__init__() + self.XPath = '' + self.IsMultiYield = True + self.OneOutput = False + + def init(self): + self.IsMultiYield = True + self.OneOutput = False + + def transform(self, data, project): + pass + # if self.IsManyData: + # tree = spider.GetHtmlTree(data[self.Column]) + # nodes = tree.xpath(self.XPath) + # for node in nodes: + # ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')} + # ext['OHTML'] = ext['HTML'] + # yield extends.MergeQuery(ext, data, self.NewColumn) + # else: + # tree = spider.GetHtmlTree(data[self.Column]) + # nodes = tree.xpath(self.XPath) + # node = nodes[0] + # if hasattr(node, 'text'): + # setValue(data, self, node.text) + # else: + # setValue(data, self, str(node)) + # yield data + + +class ToListTF(Transformer): + '''启动并行 + + ''' + + def transform(self, data, project): + yield data + + +class JsonTF(Transformer): + def __init__(self): + super(JsonTF, self).__init__() + self.OneOutput = False + self.ScriptWorkMode = '文档列表' + + def init(self): + self.IsMultiYield = self.ScriptWorkMode == '文档列表' + + def transform(self, data, project): + js = json.loads(data[self.Column]) + if isinstance(js, list): + for j in js: + yield j + else: + yield js + + +class RangeTF(Transformer): + def __init__(self): + super(RangeTF, self).__init__() + self.Skip = 0 + self.Take = 9999999 + + def transform(self, data, project): + pass + # skip = int(extends.Query(data, self.Skip)) + # take = int(extends.Query(data, self.Take)) + # i = 0 + # for r in data: + # if i < skip: + # continue + # if i >= take: + # break + # i += 1 + # yield r + + +class EtlTF(Transformer): + pass + # def transform(self, datas): + # subetl = self.__proj__.modules[self.ETLSelector] + # if self.IsMultiYield: + # + # for data in datas: + # doc = data.copy() + # for r in subetl.__generate__(subetl.AllETLTools, [doc]): + # yield extends.MergeQuery(r, data, self.NewColumn) + # else: + # yield None # TODO + + +class BaiduLocation(Transformer): + pass + + +class GetIPLocation(Transformer): + pass + + +class GetRoute(Transformer): + pass + + +class NearbySearch(Transformer): + pass + + +class NlpTF(Transformer): + pass + + +class TransTF(Transformer): + pass + + +class JoinDBTF(Transformer): + pass + + +class RepeatTF(Transformer): + pass + + +class ResponseTF(Transformer): + pass + + +class Time2StrTF(Transformer): + pass + + +class DictTF(Transformer): + pass + + +class FileExistFT(Transformer): + def __init__(self): + super(FileExistFT, self).__init__() + self.Script = '' + self.OneInput = True + + def transform(self, data, project): + return str(os.path.exists(data)) + + +class MergeRepeatTF(Transformer): + pass + + +class DelayTF(Transformer): + '''延时函数,延时转换器''' + + pass + + +class ReadFileTextTF(Transformer): + pass + + +class WriteFileTextTF(Transformer): + pass + + +class FileDataTF(Transformer): + pass diff --git a/classInit/ETLTool/__init__.py b/classInit/ETLTool/__init__.py new file mode 100644 index 0000000..be6ad51 --- /dev/null +++ b/classInit/ETLTool/__init__.py @@ -0,0 +1,6 @@ +# -*- encoding: utf-8 -*- +""" +@File : __init__.py.py +@Time : 19/8/2019 09:36 +@Author : liyang +""" diff --git a/classInit/Project.py b/classInit/Project.py new file mode 100644 index 0000000..96fbfa2 --- /dev/null +++ b/classInit/Project.py @@ -0,0 +1,16 @@ +# -*- encoding: utf-8 -*- +""" +@File : Project.py +@Time : 19/8/2019 08:55 +@Author : liyang + +工程初始化 +""" + + +class Project(): + def __init__(self): + self.modules = {} + self.tables = {} + self.connectors = {} + self.__defaultdict__ = {} diff --git a/classInit/SmartCrawler.py b/classInit/SmartCrawler.py new file mode 100644 index 0000000..478236d --- /dev/null +++ b/classInit/SmartCrawler.py @@ -0,0 +1,132 @@ +# -*- encoding: utf-8 -*- +""" +@File : SmartCrawler.py +@Time : 19/8/2019 08:56 +@Author : liyang + +工程SmartCrawler初始化及设置模块 +""" + + +class SmartCrawler(): + def __init__(self): + self.IsMultiData = "List" + self.HttpItem = None + self.Name = None + self.CrawItems = None + self.Login = "" + self.haslogin = False + self.RootXPath = '' + self.Url = '' + + # def autologin(self, loginItem): + # if loginItem.postdata is None: + # return + # import http.cookiejar + # cj = http.cookiejar.CookieJar() + # pro = urllib.request.HTTPCookieProcessor(cj) + # opener = urllib.request.build_opener(pro) + # t = [(r, loginItem.Headers[r]) for r in loginItem.Headers] + # opener.addheaders = t + # binary_data = loginItem.postdata.encode('utf-8') + # op = opener.open(loginItem.Url, binary_data) + # data = op.read().decode('utf-8') + # print(data) + # self.HttpItem.Url = op.url + # return opener + + +class CrawItem(): + '''爬虫item的创建和初始化 + + ''' + + def __init__(self, name=None, sample=None, ismust=False, isHTMLorText=True, xpath=None): + self.XPath = xpath + self.Sample = sample + self.Name = name + self.IsMust = ismust + self.IsHTMLorText = isHTMLorText + self.Children = [] + + def __str__(self): + return "%s %s %s" % (self.Name, self.XPath, self.Sample) + + +class HTTPItem(): + def __init__(self): + self.Url = '' + self.Cookie = '' + self.Headers = None + self.Timeout = 30 + self.opener = "" + self.postdata = '' + + def PraseURL(self, url): + pass + # u = Para2Dict(urlparse(self.Url).query, '&', '=') + # for r in extract.findall(url): + # url = url.replace('[' + r + ']', u[r]) + # return url + + # def GetHTML(self, destUrl=None): + # if destUrl is None: + # destUrl = self.Url + # destUrl = self.PraseURL(destUrl) + # socket.setdefaulttimeout(self.Timeout) + # cj = http.cookiejar.CookieJar() + # pro = urllib.request.HTTPCookieProcessor(cj) + # opener = urllib.request.build_opener(pro) + # t = [(r, self.Headers[r]) for r in self.Headers] + # opener.addheaders = t + # binary_data = self.postdata.encode('utf-8') + # try: + # destUrl.encode('ascii') + # except UnicodeEncodeError: + # destUrl = iriToUri(destUrl) + # + # try: + # if self.postdata == '': + # page = opener.open(destUrl) + # else: + # page = opener.open(destUrl, binary_data) + # html = page.read() + # except Exception as e: + # print(e) + # return "" + # + # if page.info().get('Content-Encoding') == 'gzip': + # html = gzip.decompress(html) + # encoding = charset.search(str(html)) + # if encoding is not None: + # encoding = encoding.group(1) + # if encoding is None: + # encoding = 'utf-8' + # try: + # html = html.decode(encoding) + # except UnicodeDecodeError as e: + # print(e) + # import chardet + # encoding = chardet.detect(html) + # html = html.decode(encoding) + # + # return html + + +def Para2Dict(para, split1, split2): + '''对xml文件的Parameters进行转换,由string转换为dict + + :param para:参数str + :param split1:第一个分割 + :param split2:第二个分割 + :return:str转换后的字典 + ''' + r = {} + for s in para.split(split1): + rs = s.split(split2) + if len(rs) < 2: + continue + key = rs[0] + value = s[len(key) + 1:] + r[rs[0]] = value + return r diff --git a/classInit/__init__.py b/classInit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/classInit/mongodbSetting.py b/classInit/mongodbSetting.py new file mode 100644 index 0000000..b71ff69 --- /dev/null +++ b/classInit/mongodbSetting.py @@ -0,0 +1,55 @@ +# -*- encoding: utf-8 -*- +""" +@File : mongodbSetting.py +@Time : 20/8/2019 11:28 +@Author : liyang + +工程的MongoDB设置及处理模块 +""" +import pymongo + + +class mongo(): + def __init__(self): + self.host = '' + self.port = 27017 + self.client = '' + self.database = '' + self.collection = '' + + def connect(self, host, port, database, collection): + '''建立MongoDB数据库连接 + + :param host: 数据库地址 + :param port:数据库端口 + :param database:待插入数据的数据库 + :param collection:待插入数据的集合 + :return: + ''' + self.host = host + self.port = port + self.database = database + self.collection = collection + try: + self.client = pymongo.MongoClient(self.host, self.port) + except: + print('MongoDB connect error') + + def insert_one(self, data): + '''向指定数据库的指定集合插入一条数据 + + :param data: 待插入的数据 + :return: + ''' + + try: + db = self.client[self.database] + collection = db[self.collection] + # with open('outFile/SmartCrawler.json', 'r', encoding='utf-8')as f: + # temp = json.loads(f.read()) + # print(temp) + # collection = client['test'].contents + result = collection.insert_one(data) + print(result) + except: + print('Database insert_one data error') diff --git a/classInit/projectExecutor.py b/classInit/projectExecutor.py new file mode 100644 index 0000000..78739d7 --- /dev/null +++ b/classInit/projectExecutor.py @@ -0,0 +1,90 @@ +# -*- encoding: utf-8 -*- +""" +@File : projectExecutor.py +@Time : 19/8/2019 11:19 +@Author : liyang + +工程执行器 +""" + +from classInit.mongodbSetting import mongo +from . import spider +from classInit.ETLTool import Transformer + + +class projExecute(): + + def __init__(self, project): + '''初始化工程 + + :param project: 传入工程项目 + ''' + self.project = project + self.modules = project.modules + self.tables = project.tables + self.connectors = project.connectors + self.__defaultdict__ = project.__defaultdict__ + # print(self.__defaultdict__) + + def saveDataToDB(self, data): + '''将数据存入MongoDB + + :param data: json格式的数据 + :return: + ''' + # 初始化mongo class + conn = mongo() + # 建立MongoDB连接 + c = conn.connect('139.196.85.202', 27017, 'test1', 'contents') + # 数据插入 + conn.insert_one(data) + + def projectFunction(self): + ''' + 迭代project的modules,并顺序执行相应功能(item) + :return: + ''' + for module_name, module in self.project.modules.items(): + module_type = str(module).split('.')[1] + # 如果为爬虫模块 + if module_type == 'SmartCrawler': + + # 爬虫提取的xpath规则 + # items = spider.setCrawItems(module.CrawItems) + # 爬虫headers + headers = spider.setHttpItem(module.HttpItem) + # 爬虫返回的数据 + html_data = spider.getURLdata(module.Url, headers) + # 根据xpath规则处理html, + data = spider.processHtml(html_data, module.RootXPath, module.CrawItems) + # 将数据存入MongoDB + # self.saveDataToDB(data) + print('SmartCrawler end!!!') + # 为数据清洗模块 + elif module_type == 'ETLTask': + # 迭代模块的各个工具(ETLTool) + # 生成器生成数据 + ges = {} + # 生成器生成标志符 + for tool in module.AllETLTools: + type = str(tool).split('.')[2] + # result为返回值 + if type == 'Generator': + # 接受生成器返回的数据 + # 如果数从文本生成 + # if isinstance(tool,Generator.TextGE): + # url = tool.generate() + # else: + ges[tool.Column] = tool.generate() + elif type == 'Transformer': + # 如果类型为DeleteTF,删除该列 + if isinstance(tool,Transformer.DeleteTF): + tool.transform(ges, self.project) + ges[tool.Column] = tool.transform(ges, self.project) + print(ges[tool.Column]) + elif type == 'Executor': + pass + elif type == 'Filter': + pass + + print('ETLTASK end!!!') diff --git a/classInit/projectLoad.py b/classInit/projectLoad.py new file mode 100644 index 0000000..51dacfb --- /dev/null +++ b/classInit/projectLoad.py @@ -0,0 +1,240 @@ +# -*- encoding: utf-8 -*- +""" +@File : projectLoad.py +@Time : 18/8/2019 18:55 +@Author : liyang + +XML工程导入模块 +""" +import copy +import re +import xml.etree.ElementTree as ET + +from classInit import ETLTask +from classInit import Project +from classInit import SmartCrawler +from classInit.ETLTool import Executor +from classInit.ETLTool import Filter +from classInit.ETLTool import Generator +from classInit.ETLTool import Transformer + +# value为int的参数 +intattrs = re.compile('Max|Min|Count|Index|Interval|Position') +# value为bool的参数 +boolre = re.compile('^(One|Can|Is)|Enable|Should|Have|Revert') +rescript = re.compile('Regex|Number') + + +def SetAttr(etl, key, value): + '''通过传入参数设置标签参数 + + :param etl: 标签对象 + :param key: 标签参数key + :param value: 标签参数value + :return: + ''' + # 如果key为'Group'或'Type',返回空 + if key in ['Group', 'Type']: + return + + # 根据正则表达式搜索key + if intattrs.search(key) is not None: + # 将对应的value转换为int + try: + t = int(value) + # 设置对象属性value为t + setattr(etl, key, t) + except ValueError: + # 返回值错误 + print('it is a ValueError') + setattr(etl, key, value) + elif boolre.search(key) is not None: + setattr(etl, key, True if value == 'True' else False) + else: + setattr(etl, key, value) + + +def get_type_name(obj): + ''' + # + :param obj: 传入类,eg. + :return:类的类型 eg. PythonTF + ''' + s = str(obj.__class__) + p = s.split('.') + r = p[-1].split('\'')[0] + return r + + +def etl_factory(item, proj): + '''将item及ietm的参数添加到工程 + + :param item:传入项目对象 + :param proj:传入工程对象 + :return:item对象 + ''' + + if isinstance(item, str): + # item的类型 + type = item[-2:] + if type == 'GE': + item = Generator.create(item) + elif type == 'TF': + item = Transformer.create(item) + elif type == 'EX': + item = Executor.create(item) + elif type == 'FT': + item = Filter.create(item) + + else: + item = item + # 获取item类型 + + name = get_type_name(item) + # 添加name到工程工作字典,值为item的参数 + if name not in proj.__defaultdict__: + proj.__defaultdict__[name] = copy.deepcopy(item.__dict__) + return item + + +def GetChildNode(roots, name): + '''获取子节点标签名为name的标签,并将该标签返回 + + :param roots: 父节点 + :param name: 子节点标签名 + :return: 子节点标签(若有),None(若无) + ''' + for etool in roots: + if etool.get('Name') == name or etool.tag == name: + return etool + return None + + +def InitFromHttpItem(config, item): + '''爬虫的http连接设置初始化 + + :param config: 爬虫连接设置 + :param item:带添加设置的item + ''' + httprib = config.attrib + # 将参数由string格式转换为字典 + paras = SmartCrawler.Para2Dict(httprib['Parameters'], '\n', ':') + # 获取item设置的header + if 'User-Agent' in paras.keys(): + # xml文件中user-agent多了一个空格,eg.' : Mozilla/5.0 (X11' + headers = paras['User-Agent'].strip() + item.Headers = {'User-Agent': headers} + # print(item.Headers ) + # 获取item的cookie + # pass + + # 获取item的url + item.Url = httprib['URL'] + # 获取item的post数据(若有) + post = 'Postdata' + if post in httprib: + item.postdata = httprib[post] + else: + item.postdata = None + + +def Project_LoadXml(path): + '''导入xml工程文件 + + :param path:工程文件路径 + :return:是否读取xml文件成功 + ''' + # 读取xml文件 + tree = ET.parse(path) + # 初始化工程 + proj = Project.Project() + + def factory(obj): + '''声明obj + + :param obj: 传入对象 + :return:etl_factory返回的obj对象 + ''' + return etl_factory(obj, proj) + + # 获取根节点 + root = tree.getroot() + # 获取doc节点 + root = root.find('Doc') + # 迭代遍历doc节点 + for etool in root: + # 获取children标签 + if etool.tag == 'Children': + # 获取该标签类型 + etype = etool.get('Type') + # 获取该标签名 + name = etool.get('Name') + # 标签为数据清理工具 + if etype == 'SmartETLTool': + # 生成ETLTask任务,并创建ETLTask对象 + etltool = factory(ETLTask.ETLTask()) + # 迭代SmartETLTool标签 + for m in etool: + if m.tag == 'Children': + # 获取标签类型 + type = m.attrib['Type'] + # 生成type型类,并创建该type类型对象 + etl = factory(type) + # 添加对象的工程为文件初始工程 + etl.__proj__ = proj + # 迭代标签参数 + for att in m.attrib: + # 传入标签对象,标签参数名,标签参数值 + SetAttr(etl, att, m.attrib[att]) + # 将标签参数添加到ETLTask对象中添加标签 + etltool.AllETLTools.append(etl) + # 将此SmartETLTool任务添加到工程 + proj.modules[name] = etltool + # 标签为爬虫 + elif etype == 'SmartCrawler': + # 生成crawler对象,并将该对象加入到工程对象proj + crawler = factory(SmartCrawler.SmartCrawler()) + # 生成crawler的HttpItem对象并将该对象加入到工程对象proj + crawler.HttpItem = factory(SmartCrawler.HTTPItem()) + crawler.Name = etool.attrib['Name'] + crawler.Url = etool.attrib['URL'] + # 爬虫的url请求数目(one和list) + crawler.IsMultiData = etool.attrib['IsMultiData'] + # 爬虫xpath的主xpath(若有) + if 'RootXPath' in etool.attrib: + crawler.RootXPath = etool.attrib['RootXPath'] + else: + crawler.RootXPath = '' + # 获取子节点,如果子节点的标签为HttpSet,代表http连接的设置 + httpconfig = GetChildNode(etool, 'HttpSet') + # 对爬虫的http设置进行初始化 + InitFromHttpItem(httpconfig, crawler.HttpItem) + # 对爬虫的login标签进行设置(若有) + login = GetChildNode(etool, 'Login') + if login is not None: + crawler.Login = factory(SmartCrawler.HTTPItem()) + InitFromHttpItem(login, crawler.Login) + # 获取该节点的所有爬虫标签的参数设置,每个标签为一个spider.CrawItem() + crawler.CrawItems = [] + # 遍历etool的children标签 + for child in etool: + if child.tag == 'Children': + # 生成一个爬虫item + crawitem = factory(SmartCrawler.CrawItem()) + crawitem.Name = child.attrib['Name'] + crawitem.XPath = child.attrib['XPath'] + crawler.CrawItems.append(crawitem) + # 将SmartCrawler任务添加到proj中 + proj.modules[name] = crawler + # 如果标签类型为数据库连接 + elif etool.tag == 'DBConnections': + pass + # for tool in etool: + # if tool.tag == 'Children': + # connector = extends.EObject() + # # 为数据库连接添加属性 + # for att in tool.attrib: + # SetAttr(connector, att, tool.attrib[att]) + # proj.connectors[connector.Name] = connector + print('load project success') + return proj diff --git a/classInit/spider.py b/classInit/spider.py new file mode 100644 index 0000000..981e314 --- /dev/null +++ b/classInit/spider.py @@ -0,0 +1,102 @@ +# -*- encoding: utf-8 -*- +""" +@File : spider.py +@Time : 19/8/2019 14:19 +@Author : liyang + +工程的爬虫模块 +""" +import re + +import requests +from lxml import etree + + +def setCrawItems(CrawItems): + ''' + 设置爬虫规则(xpath) + :param CrawItems: 爬虫规则 + :return: + ''' + items = [] + for item in CrawItems: + name = str(item.Name) + xpath = str(item.XPath) + i = [name, xpath] + items.append(item) + return items + + +def setHttpItem(HttpItem): + ''' + 设置http访问参数 + :param HttpItem: + :return: + ''' + return HttpItem.Headers + + +def getURLdata(url, headers): + """根据url爬取html + + :param url: 爬取的url + :return: html文本(正常),"get URL data error"(异常) + """ + try: + + r = requests.get(url, headers=headers, timeout=30) + # 如果状态码不是200 则应发HTTPError异常 + r.raise_for_status() + # 设置编码 + # r.encoding = r.apparent_encoding + return r.text + except: + return "get URL data error" + + +def saveData(title, data): + """储存html至指定文件夹 + + :param title: 文件标题 + :param data: 文件内容 + :return: 无 + """ + file = 'htmlFile/' + title + '.html' + with open(file, 'w', encoding='utf-8')as f: + f.write(data) + + +def processHtml(html, RootXPath, items): + """对html文件进行XPath分析提取 + + :param html: 传入的html文本 + :param RootXPath: xpath的主xpath(查询xpath=RootXPath+xpath) + :param items: 所需提取的字段(list),根据item.Name,item.xpath获取标题和xpth + :return: result (html对xpath的数据提取,list[dict{name:data}]) + """ + html = etree.HTML(html) + # result = etree.tostring(html) + result = {} + for i in items: + + # 对xpath出现‘#text[1]’进行特殊处理 + xpath = (RootXPath + i.XPath) + if xpath.find('#text[1]'): + index = re.search('text.*', xpath) + if index: + index = index.group(0) + # 提取text[12] 中的12 + # 尚未使用 + index = re.search('[0-9]+', index)[0] + xpath = re.sub('#text.[0-9]*.', '/text()', xpath) + + data = [] + for d in html.xpath(xpath): + # 如果返回为字符串,不改变 + if isinstance(d, str): + data.append(d) + # 否则提取元素的string + else: + data.append(d.xpath('string()')) + result[i.Name] = data + return result diff --git a/distributed.py b/distributed.py deleted file mode 100644 index 457bd8f..0000000 --- a/distributed.py +++ /dev/null @@ -1,139 +0,0 @@ -import sys; -from queue import Queue -from multiprocessing.managers import BaseManager -import etl; -import json -import extends; -import time; -authkey= "etlpy".encode('utf-8') -timeout=1; -rpc_port=8888 - -class ETLJob: - def __init__(self,project,jobname,config,id): - self.project= project; - self.jobname=jobname; - self.config=config; - self.id= id; - -class JobResult: - def __init__(self,name,count,id): - self.name=name; - self.count=count; - self.id=id; - -class Master: - - def __init__(self,project,jobname): - # 派发出去的作业队列 - self.dispatched_job_queue = Queue() - # 完成的作业队列 - self.finished_job_queue = Queue() - self.project= project; - self.jobname=jobname; - self.maxprocess= 10; - - def get_dispatched_job_queue(self): - return self.dispatched_job_queue - - def get_finished_job_queue(self): - return self.finished_job_queue - - def start(self,skip=0): - # 把派发作业队列和完成作业队列注册到网络上 - BaseManager.register('get_dispatched_job_queue', callable=self.get_dispatched_job_queue) - BaseManager.register('get_finished_job_queue', callable=self.get_finished_job_queue) - - # 监听端口和启动服务 - manager = BaseManager(address=('0.0.0.0', rpc_port), authkey=authkey) - manager.start() - - # 使用上面注册的方法获取队列 - dispatched_jobs = manager.get_dispatched_job_queue() - finished_jobs = manager.get_finished_job_queue() - - job_id = 0 - module= self.project.modules[self.jobname]; - - proj=json.loads(json.dumps(etl.convert_dict(self.project,self.project.__defaultdict__), ensure_ascii=False)) - while True: - for task in etl.parallel_map(module): - job_id = job_id + 1 - if job_id1: - ip=argv[1]; - if len(argv)>2: - port=int(argv[2]); - slave= Slave(); - slave.start(True,ip,port); - - diff --git a/etl.py b/etl.py deleted file mode 100644 index 47e21f8..0000000 --- a/etl.py +++ /dev/null @@ -1,975 +0,0 @@ -# coding=utf-8 -__author__ = 'zhaoyiming' -import re; -import extends -import urllib -import spider; -import json; -import html -import xml.etree.ElementTree as ET -import csv - -import os; - -intattrs = re.compile('Max|Min|Count|Index|Interval|Position'); -boolre = re.compile('^(One|Can|Is)|Enable|Should|Have|Revert'); -rescript = re.compile('Regex|Number') - - -def SetAttr(etl, key, value): - if key in ['Group','Type']: - return - - if intattrs.search(key) is not None: - try: - t = int(value); - setattr(etl, key, t); - except ValueError: - print('it is a ValueError') - setattr(etl, key, value); - elif boolre.search(key) is not None: - setattr(etl, key, True if value == 'True' else False); - else: - setattr(etl, key, value); - -def getMatchCount(mat): - return mat.lastindex if mat.lastindex is not None else 1; - -class ETLTool(extends.EObject): - def __init__(self): - self.Enabled=True; - self.Column = '' - def process(self, data): - return data - def init(self): - pass; - -class Transformer(ETLTool): - def __init__(self): - super(Transformer, self).__init__() - self.IsMultiYield=False - self.NewColumn=''; - self.OneOutput=True; - self.OneInput = False; - - def transform(self,data): - pass; - def process(self,data): - if self.IsMultiYield: # one to many - for r in data: - for p in self.transform( r): - yield extends.MergeQuery(p, r,self.NewColumn); - return; - for d in data: # one to one - if self.OneOutput: - if self.Column not in d or self.Column not in d: - yield d; - continue; - item = d[self.Column] if self.OneInput else d; - res = self.transform(item) - key= self.NewColumn if self.NewColumn!='' else self.Column; - d[key]=res; - else: - self.transform( d) - yield d; - -class Executor(ETLTool): - def execute(self,data): - pass; - def process(self,data): - for r in data: - self.execute(r); - yield r; - - -class Filter(ETLTool): - def __init__(self): - super(Filter, self).__init__() - self.Revert=False; - def filter(self,data): - - return True; - - def process(self, data): - for r in data: - item = None; - if self.Column in r: - item = r[self.Column]; - if item is None and self.__class__ != NullFT: - continue; - result = self.filter( item) - if result == True and self.Revert == False: - yield r; - elif result == False and self.Revert == True: - yield r; - -class Generator(ETLTool): - def __init__(self): - super(Generator, self).__init__() - self.MergeType='Append' - self.Position=0; - def generate(self,generator): - pass; - - def process(self, generator): - if generator is None: - return self.generate(None); - else: - if self.MergeType=='Append': - return extends.Append(generator,self.process(None)); - elif self.MergeType=='Merge': - return extends.Merge(generator, self.process(None)); - else: - return extends.Cross(generator,self.generate) - - - -class ConnectorBase(ETLTool): - def __init__(self): - super(ConnectorBase, self).__init__() - self.Connector = ''; - self.ExecuteType = 'OnlyInsert' - self.filetype = ''; - - def init(self): - self.connector= self.__proj__.connectors[self.Connector]; - if self.connector.TypeName=='MongoDBConnector': - import pymongo - client = pymongo.MongoClient(self.connector.ConnectString); - db = client[self.connector.DBName]; - self.Table = db[self.TableName]; - else: - path = self.TableName; - filetype = path.split('.')[-1].lower(); - encode = 'utf-8'; - self.file = open(path, type, encoding=encode) - self.filetype = filetype; - - -class DbEX(ConnectorBase): - def __init__(self): - super(DbEX, self).__init__() - self.TableName='' - - - - - def process(self,datas): - if self.connector.TypeName == 'MongoDBConnector': - etype = self.ExecuteType; - table = self.Table; - work = {'OnlyInsert': lambda d: table.save(d),'InsertOrUpdate':lambda d: table.save(d)}; - for data in datas: - work[etype](data); - yield data; - else: - - if self.filetype in ['csv', 'txt']: - field = extends.getkeys(datas); - self.writer = csv.DictWriter(self.file, field, delimiter=sp, lineterminator='\n') - self.writer.writeheader() - for data in datas: - self.writer.writerow(data); - yield data; - elif self.filetype == 'json': - self.file.write('[') - for data in datas: - json.dump(data, self.file, ensure_ascii=False) - self.file.write(','); - yield data; - self.file.write(']') - self.file.close(); - - -class DBGE(ConnectorBase): - - def generate(self,data): - if self.Connector=='MongoDBConnector': - for data in self.Table.find(): - yield data; - else: - if self.filetype in ['csv', 'txt']: - sp = ',' if self.filetype == 'csv' else '\t'; - reader = csv.DictReader(self.file, delimiter=sp) - for r in reader: - yield r; - elif self.filetype == 'json': - items = json.load(self.file); - for r in items: - yield r; - - def process(self, generator): - if generator is None: - return self.generate(None); - else: - if self.MergeType == 'Append': - return extends.Append(generator, self.process(None)); - elif self.MergeType == 'Merge': - return extends.Merge(generator, self.process(None)); - else: - return extends.Cross(generator, self.generate) - - -def setValue(data,etl,value): - if etl.NewColumn!='': - data[etl.NewColumn]=value; - else: - data[etl.Column]=value; - -class RegexFT(Filter): - - def init(self): - self.Regex = re.compile(self.Script); - self.Count=1; - - def filter(self,data): - v = self.Regex.findall(data); - if v is None: - return False; - else: - return self.Count <= len(v) - -class RangeFT(Filter): - - def filter(self,item): - f = float(item) - return self.Min <= f <= self.Max; - -class RepeatFT(Filter): - - def init(self): - self.set=set(); - def filter(self,data): - if data in self.set: - return False; - else: - self.set.add(data); - return True; - -class NullFT(Filter): - - def filter(self,data): - if data is None: - return False; - if isinstance(data, str): - return data.strip() != ''; - return True; - - -class AddNewTF(Transformer): - - def transform(self,data): - return self.NewValue; - - -class AutoIndexTF(Transformer): - def init(self): - super(AutoIndexTF, self).__init__() - self.currindex = 0; - def transform(self, data): - self.currindex += 1; - return self.currindex; - - -class RenameTF(Transformer): - - def __init__(self): - super(RenameTF, self).__init__() - self.OneOutput = False; - def transform(self, data): - if not self.Column in data: - return; - item = data[self.Column]; - del data[self.Column]; - if self.NewColumn != "": - data[self.NewColumn] = item; - -class DeleteTF(Transformer): - def __init__(self): - super(DeleteTF, self).__init__() - self.OneOutput = False; - def transform(self, data): - if self.Column in data: - del data[self.Column]; - -class HtmlTF(Transformer): - def __init__(self): - super(HtmlTF, self).__init__() - self.OneInput=True; - - def transform(self, data): - return html.escape(data) if self.ConvertType == 'Encode' else html.unescape(data); - - -class UrlTF(Transformer): - def __init__(self): - super(UrlTF, self).__init__() - self.OneInput = True; - def transform(self, data): - if self.ConvertType == 'Encode': - url = data.encode('utf-8'); - return urllib.parse.quote(url); - else: - return urllib.parse.unquote(data); - - -class RegexSplitTF(Transformer): - def transform(self, data): - items = re.split(self.Regex, data) - if len(items) <= self.Index: - return data; - if not self.FromBack: - return items[self.Index]; - else: - index = len(items) - self.Index - 1; - if index < 0: - return data; - else: - return items[index]; - return items[index]; - -class MergeTF(Transformer): - def __init__(self): - super(MergeTF, self).__init__() - self.Format='{0}' - self.MergeWith='' - def transform(self, data): - if self.MergeWith == '': - columns = []; - else: - columns = [str(data[r]) for r in self.MergeWith.split(' ')] - columns.insert(0, data[self.Column] if self.Column in data else ''); - res = self.Format; - for i in range(len(columns)): - res = res.replace('{' + str(i) + '}', str(columns[i])) - return res; - - - - -class RegexTF(Transformer): - def __init__(self): - super(RegexTF, self).__init__() - self.Script = ''; - self.OneInput = True; - - def init(self): - self.Regex = re.compile(self.Script); - def transform(self, data): - item = re.findall(self.Regex, str(data)); - if self.Index < 0: - return ''; - if len(item) <= self.Index: - return ''; - else: - r = item[self.Index]; - return r if isinstance(r, str) else r[0]; - -class ReReplaceTF(RegexTF): - - def transform(self, data): - return re.sub(self.Regex, self.ReplaceText, data); - -class NumberTF(RegexTF): - def __init__(self): - super(NumberTF, self).__init__() - self.Script='' #TODO - - def transform(self, data): - t = super(NumberTF,self).transform( data); - if t is not None and t != '': - return int(t); - return t; - -class SplitTF(Transformer): - def __init__(self): - super(SplitTF, self).__init__() - self.SplitChar=''; - self.OneInput = True; - - - def transform(self, data): - splits = self.SplitChar.split(' '); - sp = splits[0] - if sp == '': - return data; - - r = data.split(splits[0]); - if len(r) > self.Index: - return r[self.Index]; - return ''; - -class TrimTF(Transformer): - def __init__(self): - super(TrimTF, self).__init__() - self.OneInput = True; - - def transform(self, data): - return data.strip(); - -class StrExtractTF(Transformer): - def __init__(self): - super(StrExtractTF, self).__init__() - self.HaveStartEnd=False; - self.Start='' - self.OneInput=True; - self.End='' - - def transform(self, data): - start = data.find(self.Former); - if start == -1: - return - end = data.find(self.End, start); - if end == -1: - return; - if self.HaveStartEnd: - end += len(self.End); - if not self.HaveStartEnd: - start += len(self.Former); - return data[start:end]; - -class PythonTF(Transformer): - def __init__(self): - super(PythonTF, self).__init__() - self.OneOutput=False - self.Script='value' - self.ScriptWorkMode='不进行转换' - def transform(self, data): - result = eval(self.Script, {'value': data[self.Column]}, data); - if result is not None and self.IsMultiYield == False: - key = self.NewColumn if self.NewColumn != '' else self.Column; - data[key] = result; - return result; - -class CrawlerTF(Transformer): - def __init__(self): - super(CrawlerTF, self).__init__() - self.CrawlerSelector=''; - self.MaxTryCount=1; - self.IsRegex=False - self.OneOutput=False; - def init(self): - self.IsMultiYield = True; - self.crawler = self.__proj__.modules.get(self.CrawlerSelector, None); - self.buff = {}; - def transform(self, data): - crawler = self.crawler; - url = data[self.Column]; - buff = self.buff; - if url in buff: - datas = buff[url]; - else: - datas = crawler.CrawData(url); - if len(buff) < 100: - buff[url] = datas; - if self.crawler.IsMultiData == 'List': - for d in datas: - res = extends.MergeQuery(d, data, self.NewColumn); - yield res; - else: - data = extends.Merge(data, datas); - yield data; - - -class XPathTF(Transformer): - def __init__(self): - super(XPathTF, self).__init__() - self.XPath='' - self.IsMultiYield = True; - self.OneOutput=False; - - def init(self): - self.IsMultiYield=True; - self.OneOutput = False; - def transform(self, data): - from lxml import etree - if self.IsManyData: - tree = spider.GetHtmlTree(data[self.Column]); - nodes = tree.xpath(self.XPath); - for node in nodes: - ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}; - ext['OHTML'] = ext['HTML'] - yield extends.MergeQuery(ext, data, self.NewColumn); - else: - tree = spider.GetHtmlTree(data[self.Column]); - nodes = tree.xpath(self.XPath); - node=nodes[0] - if hasattr(node,'text'): - setValue(data, self, node.text); - else: - setValue(data,self,str(node)) - yield data; - - -class ToListTF(Transformer): - def transform(self, data): - yield data; - -class JsonTF(Transformer): - def __init__(self): - super(JsonTF, self).__init__() - self.OneOutput=False - self.ScriptWorkMode='文档列表'; - - def init(self): - self.IsMultiYield= self.ScriptWorkMode=='文档列表'; - - def transform(self, data): - js = json.loads(data[self.Column]); - if isinstance(js, list): - for j in js: - yield j; - else: - yield js; - -class RangeGE(Generator): - def __init__(self): - super(RangeGE, self).__init__() - self.Interval='1' - self.MaxValue='1' - self.MinValue='1' - def generate(self,generator): - interval= int(extends.Query(generator,self.Interval)) - maxvalue= int(extends.Query(generator,self.MaxValue)) - minvalue= int(extends.Query(generator,self.MinValue)) - for i in range(minvalue,maxvalue,interval): - item= {self.Column:round(i,5)} - yield item; - -class RangeTF(Transformer): - def __init__(self): - super(RangeTF, self).__init__() - self.Skip=0; - self.Take=9999999; - def transform(self, data): - skip = int(extends.Query(data, self.Skip)); - take = int(extends.Query(data, self.Take)); - i = 0; - for r in data: - if i < skip: - continue; - if i >= take: - break; - i += 1; - yield r; - - -class EtlGE(Generator): - def generate(self,data): - subetl = self.__proj__.modules[self.ETLSelector]; - for r in generate(subetl.AllETLTools): - yield r; - -class EtlEX(Executor): - def execute(self,datas): - subetl = self.__proj__.modules[self.ETLSelector]; - for data in datas: - if spider.IsNone(self.NewColumn): - doc = data.copy(); - else: - doc = {}; - extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column); - result=(r for r in generate(subetl.AllETLTools, [doc])) - count=0; - for r in result: - count+=1; - print(r); - print(count) - yield data; - -class EtlTF(Transformer): - def transform(self,datas): - subetl = self.__proj__.modules[self.ETLSelector]; - if self.IsMultiYield: - - for data in datas: - doc = data.copy(); - for r in subetl.__generate__(subetl.AllETLTools, [doc]): - yield extends.MergeQuery(r, data, self.NewColumn); - else: - yield None; # TODO - - - -class TextGE(Generator): - def __init__(self): - super(TextGE, self).__init__() - self.Content=''; - def init(self): - self.arglists= [r.strip() for r in self.Content.split('\n')]; - def generate(self,data): - for i in range(self.Position, len(self.arglists)): - yield {self.Column: self.arglists[i]} - - - - - - -class TableEX(Executor): - def __init__(self): - super(TableEX, self).__init__() - self.Table = 'Table'; - def execute(self,data): - tables= self.__proj__.tables; - tname = self.Table; - if tname not in tables: - tables[tname] = []; - for r in data: - tables[tname].append(r); - yield r; - - - - - - - -class BaiduLocation(Transformer): - pass; - - -class GetIPLocation(Transformer): - pass; - -class GetRoute(Transformer): - pass; - -class NearbySearch(Transformer): - pass; - -class NlpTF(Transformer): - pass; - -class TransTF(Transformer): - pass; -class JoinDBTF(Transformer): - pass; - -class RepeatTF(Transformer): - pass; -class ResponseTF(Transformer): - pass; - -class Time2StrTF(Transformer): - pass; - - -class BfsGE(Generator): - pass; - -class DictTF(Transformer): - pass; - -class FileExistFT(Transformer): - def __init__(self): - super(FileExistFT,self).__init__(); - self.Script = ''; - self.OneInput = True; - def transform(self,data): - import os; - return str(os.path.exists(data)); - -class MergeRepeatTF(Transformer): - pass; - -class NumRangeFT(Filter): - pass; - -class DelayTF(Transformer): - pass; - -class ReadFileTextTF(Transformer): - pass; - -class WriteFileTextTF(Transformer): - pass; -class FolderGE(Generator): - pass; - -class TableGE(Generator): - pass; -class FileDataTF(Transformer): - pass; - - - -class SaveFileEX(Executor): - def __init__(self): - super(SaveFileEX, self).__init__() - self.SavePath=''; - - def execute(self,data): - - save_path = extends.Query(data, self.SavePath); - (folder,file)=os.path.split(save_path); - if not os.path.exists(folder): - os.makedirs(folder); - urllib.request.urlretrieve(data[self.Column], save_path) - - -def GetChildNode(roots, name): - for etool in roots: - if etool.get('Name') == name or etool.tag == name: - return etool; - return None; - - -def InitFromHttpItem(config, item): - httprib = config.attrib; - paras = spider.Para2Dict(httprib['Parameters'], '\n', ':'); - item.Headers = paras; - item.Url = httprib['URL']; - post = 'Postdata'; - if post in httprib: - item.postdata = httprib[post]; - else: - item.postdata = None; - - - - -class Project(extends.EObject): - def __init__(self): - self.modules={}; - self.tables={} - self.connectors={}; - self.__defaultdict__={}; - - -def LoadProject_dict(dic): - proj = Project(); - for key,connector in dic['connectors'].items(): - proj.connectors[key]= extends.dict_to_poco_type(connector); - for key,module in dic['modules'].items(): - task =None; - if 'AllETLTools' in module: - task = etl_factory(ETLTask(),proj); - for r in module['AllETLTools']: - etl= etl_factory(r['Type'],proj); - for attr,value in r.items(): - if attr in ['Type']: - continue; - setattr(etl,attr,value); - etl.__proj__=proj; - task.AllETLTools.append(etl) - elif 'CrawItems' in module: - task=etl_factory(spider.SmartCrawler(),proj); - task.CrawItems=[]; - extends.dict_copy_poco(task,module); - for r in module['CrawItems']: - crawlitem= etl_factory(spider.CrawItem(),proj) - extends.dict_copy_poco(crawlitem,r); - task.CrawItems.append(crawlitem) - task.HttpItem= etl_factory(spider.HTTPItem(),proj) - extends.dict_copy_poco(task.HttpItem,module['HttpItem']) - task.HttpItem.Headers=module['HttpItem']["Headers"]; - if task is not None: - proj.modules[key]=task; - - print('load project success') - return proj; - - -def task_DumpLinq(tools): - array=[]; - for t in tools: - typename= extends.get_type_name(t); - newcolumn=getattr(t,'NewColumn',''); - s='%s,%s'%(typename,t.Column); - s+='=>%s,'%newcolumn if newcolumn!='' else ','; - attrs=[]; - defaultdict= t.__proj__.__defaultdict__[typename]; - for att in t.__dict__: - value=t.__dict__[att]; - if att in ['NewColumn','Column','IsMultiYield']: - continue - if not isinstance(value,(str,int,bool,float)): - continue; - if value is None or att not in defaultdict or defaultdict[att]==value: - continue; - attrs.append('%s=%s'%(att,value)); - s+=','.join(attrs) - array.append(s) - return '\n'.join(array); - -def convert_dict(obj,defaultdict): - if not isinstance(obj, (str, int, float, list, dict, tuple, extends.EObject)): - return None -# if isinstance(obj,) - if isinstance(obj, extends.EObject): - d={} - typename = extends.get_type_name(obj); - - for key, value in obj.__dict__.items(): - if typename in defaultdict: - default = defaultdict[typename]; - if value== default.get(key,None): - continue; - if key.startswith('__'): - continue; - - p =convert_dict(value,defaultdict) - if p is not None: - d[key]=p - if isinstance(obj,ETLTool): - d['Type']= typename; - return d; - - elif isinstance(obj, list): - return [convert_dict(r,defaultdict) for r in obj]; - elif isinstance(obj,dict): - return {key: convert_dict(value,defaultdict) for key,value in obj.items()} - return obj; - - - - - return d - -def Project_DumpJson(proj): - dic= convert_dict(proj,proj.__defaultdict__) - return json.dumps(dic, ensure_ascii=False, indent=2) - - -def Project_LoadJson(js): - d=json.loads(js); - return LoadProject_dict(d) - -def etl_factory(item,proj): - if isinstance(item,str): - item=eval('%s()'%item); - else: - item=item; - import copy - name = extends.get_type_name(item) - if name not in proj.__defaultdict__: - proj.__defaultdict__[name]=copy.deepcopy( item.__dict__); - return item; - - -def Project_LoadXml(path): - tree = ET.parse(path); - proj=Project(); - def factory(obj): - return etl_factory(obj,proj); - root = tree.getroot(); - root = root.find('Doc'); - for etool in root: - if etool.tag == 'Children': - etype = etool.get('Type'); - name = etool.get('Name'); - if etype == 'SmartETLTool': - etltool = factory(ETLTask()); - for m in etool: - if m.tag == 'Children': - type= m.attrib['Type'] - etl = factory(type); - etl.__proj__=proj - for att in m.attrib: - SetAttr(etl, att, m.attrib[att]); - etltool.AllETLTools.append(etl); - proj.modules[name] = etltool; - elif etype == 'SmartCrawler': - import spider; - crawler =factory(spider.SmartCrawler()); - crawler.HttpItem= factory(spider.HTTPItem()) - crawler.Name = etool.attrib['Name']; - crawler.IsMultiData = etool.attrib['IsMultiData'] - crawler.RootXPath= etool.attrib['RootXPath'] - httpconfig = GetChildNode(etool, 'HttpSet'); - InitFromHttpItem(httpconfig, crawler.HttpItem); - login = GetChildNode(etool, 'Login'); - if login is not None: - crawler.Login = factory(spider.HTTPItem()); - InitFromHttpItem(login, crawler.Login); - crawler.CrawItems = []; - for child in etool: - if child.tag == 'Children': - crawitem= factory(spider.CrawItem()); - crawitem.Name=child.attrib['Name']; - crawitem.XPath = child.attrib['XPath']; - crawler.CrawItems.append(crawitem); - - proj.modules[name] = crawler; - elif etool.tag == 'DBConnections': - for tool in etool: - if tool.tag == 'Children': - connector = extends.EObject(); - for att in tool.attrib: - SetAttr(connector, att, tool.attrib[att]); - proj.connectors[connector.Name] = connector; - - print('load project success') - return proj; - - -def generate(tools, generator=None, execute=False, enabledFilter=True): - #print(task_DumpLinq(tools)); - for tool in tools: - if tool.Enabled == False and enabledFilter == True: - continue - tool.init(); - if isinstance(tool,Executor) and execute==False: - continue; - - generator = tool.process(generator) - return generator; - -def parallel_map(task, execute=True): - tools = task.AllETLTools; - index = extends.getindex(tools, lambda d: isinstance(d, ToListTF)); - if index == -1: - index = 0; - tool = tools[index]; - generator = tool.process(None); - else: - generator = generate(tools[:index],None, execute=execute); - return generator; - -def parallel_reduce(task,generator=None, execute=True): - tools = task.AllETLTools; - index = extends.getindex(tools, lambda d: isinstance(d,ToListTF)); - index =0 if index==-1 else index; - generator = generate(tools[index + 1:], generator, execute); - return generator; - - - - - - -class ETLTask(extends.EObject): - def __init__(self): - self.AllETLTools = []; - - - - def QueryDatas(self, etlCount=100, execute=False): - return generate((tool for tool in self.AllETLTools[:etlCount]), None, execute); - - def Close(self): - for tool in self.AllETLTools: - if tool.Type in ['DbGE', 'DbEX']: - if tool.connector.TypeName == 'FileManager': - if tool.filetype == 'json': - tool.file.write('{}]'); - tool.file.close(); - - - def mThreadExecute(self, threadcount=10,canexecute=True): - import threadpool - pool = threadpool.ThreadPool(threadcount) - - seed= parallel_map(self,canexecute); - def Funcs(item): - task= parallel_reduce(self,[item],canexecute); - print('totalcount: %d'%len([r for r in task])); - print('finish' + str(item)); - - requests = threadpool.makeRequests(Funcs, seed); - [pool.putRequest(req) for req in requests] - pool.wait() - # self.__close__() - - diff --git a/extends.py b/extends.py deleted file mode 100644 index 45f5a6a..0000000 --- a/extends.py +++ /dev/null @@ -1,128 +0,0 @@ -# encoding: UTF-8 -import re; - -spacere = re.compile("[ ]{2,}"); -spacern = re.compile("(^\r\n?)|(\r\n?$)") - - -def getkeys(generator): - count=0; - s=set(); - for r in generator: - s=s|r.keys(); - count+=1; - if count>=20: - return list(s); - return list(s) - -def ReplaceLongSpace(txt): - r = spacere.subn(' ', txt)[0] - r = spacern.subn('', r)[0] - return r; - - -def Merge(d1, d2): - for r in d2: - d1[r] = d2[r]; - return d1; - - -def MergeQuery(d1, d2, columns): - if isinstance(columns, str) and columns.strip() != "": - columns = columns.split(' '); - for r in columns: - if r in d2: - d1[r] = d2[r]; - return d1; - - - - -def Query(data, key): - if data is None: - return key; - if isinstance(key, str) and key.startswith('[') and key.endswith(']'): - key = key[1:-1]; - return data[key]; - return key; - - - - - -def findany(iteral, func): - for r in iteral: - if func(r): - return True; - return False; - - -def getindex(iteral, func): - for r in range(len(iteral)): - if func(iteral[r]): - return r; - return -1; - -def Cross(a, genefunc): - - for r1 in a: - for r2 in genefunc(r1): - for key in r2: - r1[key] = r2[key] - yield r1; - - -def MergeAll(a, b): - while True: - t1 = a.__next__() - if t1 is None: - return; - t2 = b.__next__() - if t2 is not None: - for t in t2: - t1[t] = t2[t]; - yield t1; - - -def Append(a, b): - for r in a: - yield r; - for r in b: - yield r; - -def get_type_name(obj): - s=str(obj.__class__); - p=s.find('.'); - r= s[p+1:].split('\'')[0] - return r; - - -class EObject(object): - pass; - - - -def convert_to_builtin_type(obj): - d= { key:value for key,value in obj.__dict__.items() if isinstance(value,(str,int,float,list,dict,tuple,EObject) or value is None)}; - return d - -def dict_to_poco_type(obj): - if isinstance(obj,dict): - result= EObject(); - for key in obj: - v= obj[key] - setattr(result,key,dict_to_poco_type(v)) - return result - elif isinstance(obj,list): - for i in range(len(obj)): - obj[i]=dict_to_poco_type(obj[i]); - - return obj; - - -def dict_copy_poco(obj,dic): - for key,value in obj.__dict__.items(): - if key in dic: - if isinstance(dic[key], (str,int,float)): - - setattr(obj,key,dic[key]) diff --git a/main.py b/main.py new file mode 100644 index 0000000..a4b68ef --- /dev/null +++ b/main.py @@ -0,0 +1,17 @@ +# -*- encoding: utf-8 -*- +""" +@File : main.py +@Time : 18/8/2019 18:55 +@Author : liyang + +工程执行main文件 +""" + +from classInit import projectLoad +from classInit.projectExecutor import projExecute + +path = 'xmlFile' +project = projectLoad.Project_LoadXml(path + '/demo.xml') +print(project.modules) +proj = projExecute(project) +t = proj.projectFunction() diff --git a/outFile/SmartCrawler.json b/outFile/SmartCrawler.json new file mode 100644 index 0000000..b812284 --- /dev/null +++ b/outFile/SmartCrawler.json @@ -0,0 +1,90 @@ +{ + "noresultRecommend_img_LOGCLICKDATA_vr_item_src": [ + "https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686", + "https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686" + ] +}, +{"noresultRecommend_img_LOGCLICKDATA_vr_item_class": ["lj-lazy", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "lj-lazy", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "vr_item", "lj-lazy", "vr_item", "vr_item", "vr_item"]}, +{ +"noresultRecommend_img_LOGCLICKDATA_lj-lazy_data-original": [ +"https://image1.ljcdn.com/110000-inspection/prod-4b341336-be62-4db2-b57e-5a26102ed699.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-eddc4c04-031c-43b8-b9aa-57a79883db5f.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-f2a3eb14-e5b3-4b85-8eb6-fc72407fc573.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-387d1411-3bb7-4a05-9c21-ccc2683bfc96.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-eb4806bd-bf20-4765-8aa6-01b3976ae697.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-692b3dd4-3dff-4517-8ae9-6084efe2b049.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-2a920848-c463-4a9c-8a7e-7b89221c5106.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-6c1e1381-f1d3-44c6-b257-b881c348f268.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-d704117b-f678-4f91-839c-b9adb8b6c643.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-c8a7ccef-84b4-42cd-a11f-2be5886f88e3.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-6a44d10e-9cdf-4f60-a0ff-d522cbc45f03.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-47caa008-a052-4d8a-85f9-58edaad3eea4.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-1154fd94-dd2b-4da1-899b-988a8fe6c35a.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/test-b5bbeefd-2fb4-452d-a752-ea015732e709.png.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-dcb1b6d0-629c-482e-ad63-cd51f0bbdd58.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-69ba93a0-d55c-4ee6-b125-04bf03bf2ba4.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-484fed7b-1801-4640-8c4d-94cba67e52e4.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-9582f142-2e3f-4282-95db-e100fe1d4cf9.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-024fd6b8-6760-4f75-9bf3-189f152a66b7.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-756ec8d2-ba26-4dae-abb9-756fa7dc1db3.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-4a5f973f-ea2f-40d7-bf88-605f7f578fed.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-ef191c81-16ec-42eb-a9d8-1b303d692fc8.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-2234ef15-0ab2-4dad-81da-96dc09f9f791.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-2346212e-061c-4510-85aa-9b37b68ad608.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-8703d387-e7b7-4d5a-9fbf-611374cb4ac1.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-e13379b0-06c6-4399-9d56-f76307508882.jpg.296x216.jpg", "https://image1.ljcdn.com/110000-inspection/prod-dc5419e3-ce76-46d6-a7e0-8316710edbfc.jpg.296x216.jpg" +]}, +{ +"noresultRecommend_img_LOGCLICKDATA_lj-lazy_alt": [ +"\u5317\u4eac\u897f\u57ce\u9676\u7136\u4ead", "\u5317\u4eac\u897f\u57ce\u897f\u56db", "\u5317\u4eac\u901a\u5dde\u6b66\u5937\u82b1\u56ed", "\u5317\u4eac\u671d\u9633\u77f3\u4f5b\u8425", "\u5317\u4eac\u901a\u5dde\u901a\u5dde\u5317\u82d1", "\u5317\u4eac\u660c\u5e73\u56de\u9f99\u89c2", "\u5317\u4eac\u901a\u5dde\u7389\u6865", "\u5317\u4eac\u6d77\u6dc0\u897f\u4e09\u65d7", "\u5317\u4eac\u6d77\u6dc0\u53cc\u6986\u6811", "\u5317\u4eac\u77f3\u666f\u5c71\u82f9\u679c\u56ed", "\u5317\u4eac\u671d\u9633\u5de5\u4f53", "\u5317\u4eac\u671d\u9633\u52b2\u677e", "\u5317\u4eac\u671d\u9633\u56e2\u7ed3\u6e56", "\u5317\u4eac\u671d\u9633\u4e9a\u8fd0\u6751\u5c0f\u8425", "\u5317\u4eac\u660c\u5e73\u56de\u9f99\u89c2", "\u5317\u4eac\u660c\u5e73\u5929\u901a\u82d1", "\u5317\u4eac\u671d\u9633\u5b89\u8d1e", "\u5317\u4eac\u671d\u9633\u77f3\u4f5b\u8425", "\u5317\u4eac\u897f\u57ce\u9676\u7136\u4ead", "\u5317\u4eac\u4e30\u53f0\u5218\u5bb6\u7a91", "\u5317\u4eac\u897f\u57ce\u5e7f\u5b89\u95e8", "\u5317\u4eac\u671d\u9633\u52b2\u677e", "\u5317\u4eac\u671d\u9633\u8c46\u5404\u5e84", "\u5317\u4eac\u671d\u9633\u751c\u6c34\u56ed", "\u5317\u4eac\u4e30\u53f0\u7389\u6cc9\u8425", "\u5317\u4eac\u660c\u5e73\u5929\u901a\u82d1", "\u5317\u4eac\u660c\u5e73\u970d\u8425" +]}, +{ +"clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_href": [ +"https://bj.lianjia.com/ershoufang/101105458455.html", "https://bj.lianjia.com/ershoufang/101105296451.html", "https://bj.lianjia.com/ershoufang/101105456335.html", "https://bj.lianjia.com/ershoufang/101105080586.html", "https://bj.lianjia.com/ershoufang/101104595999.html", "https://bj.lianjia.com/ershoufang/101105153614.html", "https://bj.lianjia.com/ershoufang/101105320359.html", "https://bj.lianjia.com/ershoufang/101104645675.html", "https://bj.lianjia.com/ershoufang/101105329813.html", "https://bj.lianjia.com/ershoufang/101105279486.html", "https://bj.lianjia.com/ershoufang/101104791137.html", "https://bj.lianjia.com/ershoufang/101104349401.html", "https://bj.lianjia.com/ershoufang/101105162851.html", "https://bj.lianjia.com/ershoufang/101104352142.html", "https://bj.lianjia.com/ershoufang/101105465856.html", "https://bj.lianjia.com/ershoufang/101104049181.html", "https://bj.lianjia.com/ershoufang/101105326853.html", "https://bj.lianjia.com/ershoufang/101105452686.html", "https://bj.lianjia.com/ershoufang/101105432369.html", "https://bj.lianjia.com/ershoufang/101105193502.html", "https://bj.lianjia.com/ershoufang/101104932338.html", "https://bj.lianjia.com/ershoufang/101105429916.html", "https://bj.lianjia.com/ershoufang/101105287275.html", "https://bj.lianjia.com/ershoufang/101105439062.html", "https://bj.lianjia.com/ershoufang/101104402834.html", "https://bj.lianjia.com/ershoufang/101105443249.html", "https://bj.lianjia.com/ershoufang/101104501599.html", "https://bj.lianjia.com/ershoufang/101105223490.html", "https://bj.lianjia.com/ershoufang/101105111764.html", "https://bj.lianjia.com/ershoufang/101105327653.html" +]}, +{ +"clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_data-log_index": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30"]}, +{ +"clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_data-housecode": ["101105458455", "101105296451", "101105456335", "101105080586", "101104595999", "101105153614", "101105320359", "101104645675", "101105329813", "101105279486", "101104791137", "101104349401", "101105162851", "101104352142", "101105465856", "101104049181", "101105326853", "101105452686", "101105432369", "101105193502", "101104932338", "101105429916", "101105287275", "101105439062", "101104402834", "101105443249", "101104501599", "101105223490", "101105111764", "101105327653"]}, +{ +"title_": [ +"\u59da\u5bb6\u4e95\u4e8c\u5df7 1\u5ba41\u5385 539\u4e07", "\u9ed1\u7a91\u5382\u897f\u91cc\u5357\u5317\u901a\u900f\u4f4e\u697c\u5c42\u4e24\u5c45\u5ba4", "\u7231\u6c11\u91cc \u6ee1\u4e94\u5e74 \u4e00\u5c45 \u968f\u65f6\u53ef\u770b", "\u6708\u5b63\u56ed 2\u5ba41\u5385 440\u4e07", "\u5168\u5357\u5411\u4e00\u5c45\u5ba4\uff0c\u660e\u53a8\u660e\u536b\uff0c\u4e2d\u95f4\u697c\u5c42\uff0c\u89c6\u91ce\u597d\uff0c\u91c7\u5149\u5145\u8db3", "\u901a\u5dde\u5317\u82d1\u5546\u5708 \u6ee1\u4e94\u552f\u4e00 \u5168\u660e\u6237\u578b \u770b\u623f\u65b9\u4fbf", "\u5546\u54c1\u623f\u6ee1\u4e94\u5e74\u552f\u4e00 \u5c0f\u533a\u4e2d\u95f4\u4f4d\u7f6e \u91c7\u5149\u597d", "\u683c\u5170\u6674\u5929 3\u5ba42\u5385 632\u4e07", "\u6c81\u6625\u5bb6\u56ed\u7cbe\u88c5\u4fee\u5357\u5317\u5411\u4e24\u5c45\u5ba4\u91c7\u5149\u65e0\u906e\u6321\u4e1a\u4e3b\u8bda\u5fc3\u51fa\u552e", "\u677f\u697c\u5357\u5411\u7cbe\u88c5\u4e24\u5c45\u5ba4 \u6237\u578b\u65b9\u6b63\u4f7f\u7528\u7387\u9ad8 \u770b\u623f\u65b9\u4fbf", "\u6a21\u5f0f\u53e3\u5317\u91cc 2\u5ba42\u5385 390\u4e07", "\u6ee1\u4e94\u5e74\u552f\u4e00\uff0c\u5357\u5317\u901a\u900f\u4e09\u5c45\u5ba4\uff0c\u5a5a\u623f\u88c5\u4fee", "\u4e09\u73af\u8fb9\u513f\u5bcc\u987f\u52b2\u677e\u5730\u94c1\u53e3\u4f4e\u603b\u4ef7\u5357\u5411\u5f00\u95f4\u8bda\u610f\u51fa\u552e", "\u677f\u697c\u76842\u5c42\uff0c\u5c0f\u533a\u5b89\u9759\u6574\u6d01\uff0c\u7a0e\u8d39\u5c11\uff0c\u968f\u65f6\u7b7e\u7ea6", "\u6ee1\u4e94\u5e74\u4e0d\u552f\u4e00\u516c\u623f\uff0c\u8bda\u5fc3\u51fa\u552e\uff0c\u6237\u578b\u65b9\u6b63\u3002", "\u4e16\u7eaa\u6751\u897f\u533a \u6ee1\u4e94\u552f\u4e00 \u7cbe\u88c5\u5357\u5317\u4e24\u5c45\u5ba4 \u4e2d\u95f4\u5c42", "\u56de\u9f99\u89c2\u7cbe\u88c5\u4fee\u7535\u68af\u4e24\u5c45\u4e24\u536b \u5357\u5317\u901a\u900f \u4e00\u68af\u4e24\u6237", "\u6ee1\u4e94\u552f\u4e00\u65e0\u4e2a\u7a0e\uff0c\u677f\u697c\u4e8c\u5c42\uff0c\u5357\u5317\u53cc\u901a\u900f", "\u5b89\u8d1e\u897f\u91cc\u4e00\u5c42\u4e24\u5c45\u5ba4\uff0c\u4e1a\u4e3b\u81ea\u4f4f\uff0c\u697c\u4e0b\u5730\u94c1\u5b89\u534e\u6865\u7ad9", "\u671d\u9633\u6377\u5ea7 \u6ee1\u4e94\u552f\u4e00 \u4e2d\u95f4\u697c\u5c42 \u7cbe\u88c5\u4fee", "\u7545\u67f3\u56ed\u89c4\u77e9\u4e24\u5c45\u5ba4 \u4e1c\u5317\u671d\u5411 \u7535\u68af\u76f4\u8fbe", "\u4e1c\u5411\u4e00\u5c45\u5ba4 \u6ee1\u4e94\u5e74\u552f\u4e00\u5546\u54c1\u623f \u7a0e\u8d39\u5c11", "\u897f\u57ce\u533a \u7ea2\u5c45\u659c\u8857 1997\u5e74\u6b63\u89c4\u4e00\u5c45\u5ba4 \u671d\u5357\u4e2d\u95f4\u697c\u5c42", "\u5357\u5317\u901a\u900f\u5168\u660e3\u5c45\uff0c\u5e26\u7535\u68af\uff0c\u5e26\u8f66\u4f4d\uff0c\u7cbe\u88c5\u4fee\uff01", "\u5bcc\u529b\u53c8\u4e00\u57ceC\u533a\uff0c\u5168\u5357\u5411\u4e24\u5c45\uff0c\u4e2d\u95f4\u5c42\uff0c\u62ce\u5305\u5373\u5165\u4f4f", "\u6c34\u7893\u5b50\u4e1c\u91cc\u5357\u5317\u56db\u5c45\u5ba4 \u660e\u53a8\u660e\u536b", "\u5317\u8857\u5bb6\u56ed\u7cbe\u88c5\u4e24\u5c45 \u5357\u5317\u901a\u900f \u6ee1\u4e94\u552f\u4e00 \u4e00\u68af\u4e24\u6237", "\u4e07\u5e74\u82b1\u57ce\u56db\u671f 1\u5ba41\u5385 450\u4e07", "\u5357\u5317\u901a\u900f\u4e24\u5c45\u5ba4\uff0c\u914d\u5408\u8f6c\u5546 \u4e2d\u95f4\u697c\u5c42 \u91c7\u5149\u89c6\u91ce\u597d", "\u677f\u697c\u7535\u68af\u623f\u4e2d\uff0c\u9ad8\u697c\u5c42\uff0c\u6237\u578b\u65b9\u6b63\uff0c\u5357\u5411\u5f00\u95f4" +]}, +{ +"title_yezhushuo_tagBlock": ["\u65b0\u4e0a", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u65b0\u4e0a", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350", "\u623f\u4e3b\u81ea\u8350"]}, +{ +"a": ["\u59da\u5bb6\u4e95\u4e8c\u5df7 ", "\u9ed1\u7a91\u5382\u897f\u91cc ", "\u7231\u6c11\u91cc\u5c0f\u533a ", "\u6708\u5b63\u56ed ", "\u516b\u91cc\u5e84\u5317\u91cc\u5c0f\u533a ", "\u897f\u9a6c\u5e84\u56ed ", "\u4e91\u8da3\u56ed\u4e00\u533a ", "\u683c\u5170\u6674\u5929 ", "\u6c81\u6625\u5bb6\u56ed ", "\u53cc\u6986\u6811\u5317\u91cc ", "\u6a21\u5f0f\u53e3\u5317\u91cc ", "\u5e78\u798f\u4e8c\u6751 ", "\u5bcc\u987f ", "\u56e2\u7ed3\u6e56\u4e2d\u8def\u5357\u4e00\u6761 ", "\u6c7d\u5357\u5c0f\u533a ", "\u4e16\u7eaa\u6751\u897f\u533a ", "\u4f70\u5609\u57ce ", "\u5929\u901a\u82d1\u4e1c\u4e8c\u533a ", "\u5b89\u8d1e\u897f\u91cc ", "\u671d\u9633\u6377\u5ea7 ", "\u7545\u67f3\u56ed ", "\u946b\u5146\u96c5\u56ed\u5317\u533a ", "\u7ea2\u5c45\u659c\u8857 ", "\u519c\u5149\u91cc ", "\u5bcc\u529b\u53c8\u4e00\u57ceC\u533a ", "\u6c34\u7893\u5b50\u4e1c\u91cc ", "\u5317\u8857\u5bb6\u56ed\u516d\u533a ", "\u4e07\u5e74\u82b1\u57ce\u56db\u671f ", "\u5929\u901a\u82d1\u4e1c\u4e8c\u533a ", "\u7d2b\u91d1\u65b0\u5e72\u7ebf "]}, +{ +"\u5c5e\u602710": [ +"\u59da\u5bb6\u4e95\u4e8c\u5df7 ", " | 1\u5ba41\u5385 | 35.52\u5e73\u7c73 | \u5357 | \u5176\u4ed6", "\u9ed1\u7a91\u5382\u897f\u91cc ", " | 2\u5ba41\u5385 | 48.21\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u7231\u6c11\u91cc\u5c0f\u533a ", " | 1\u5ba41\u5385 | 39.4\u5e73\u7c73 | \u5357 | \u7b80\u88c5", "\u6708\u5b63\u56ed ", " | 2\u5ba41\u5385 | 100.73\u5e73\u7c73 | \u4e1c \u5357 \u5317 | \u7b80\u88c5", "\u516b\u91cc\u5e84\u5317\u91cc\u5c0f\u533a ", " | 1\u5ba41\u5385 | 42.07\u5e73\u7c73 | \u5357 | \u7b80\u88c5", "\u897f\u9a6c\u5e84\u56ed ", " | 2\u5ba41\u5385 | 56.6\u5e73\u7c73 | \u4e1c \u897f | \u7cbe\u88c5", "\u4e91\u8da3\u56ed\u4e00\u533a ", " | 2\u5ba41\u5385 | 84.44\u5e73\u7c73 | \u5357 \u5317 | \u7b80\u88c5", "\u683c\u5170\u6674\u5929 ", " | 3\u5ba42\u5385 | 126.61\u5e73\u7c73 | \u4e1c\u5357 | \u7b80\u88c5", "\u6c81\u6625\u5bb6\u56ed ", " | 2\u5ba41\u5385 | 77.42\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u53cc\u6986\u6811\u5317\u91cc ", " | 2\u5ba41\u5385 | 50.1\u5e73\u7c73 | \u5357 | \u7cbe\u88c5", "\u6a21\u5f0f\u53e3\u5317\u91cc ", " | 2\u5ba42\u5385 | 101.54\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u5e78\u798f\u4e8c\u6751 ", " | 3\u5ba41\u5385 | 113.84\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u5bcc\u987f ", " | 1\u5ba40\u5385 | 38.83\u5e73\u7c73 | \u5357 | \u7b80\u88c5", "\u56e2\u7ed3\u6e56\u4e2d\u8def\u5357\u4e00\u6761 ", " | 2\u5ba41\u5385 | 60.8\u5e73\u7c73 | \u5357 \u5317 | \u7b80\u88c5", "\u6c7d\u5357\u5c0f\u533a ", " | 2\u5ba41\u5385 | 50.1\u5e73\u7c73 | \u5357 \u897f \u5317 | \u7b80\u88c5", "\u4e16\u7eaa\u6751\u897f\u533a ", " | 2\u5ba41\u5385 | 59.1\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u4f70\u5609\u57ce ", " | 2\u5ba41\u5385 | 93.77\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u5929\u901a\u82d1\u4e1c\u4e8c\u533a ", " | 2\u5ba41\u5385 | 111.9\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u5b89\u8d1e\u897f\u91cc ", " | 2\u5ba41\u5385 | 52.16\u5e73\u7c73 | \u4e1c \u897f | \u7cbe\u88c5", "\u671d\u9633\u6377\u5ea7 ", " | 2\u5ba41\u5385 | 83.16\u5e73\u7c73 | \u4e1c | \u7cbe\u88c5", "\u7545\u67f3\u56ed ", " | 2\u5ba41\u5385 | 64.97\u5e73\u7c73 | \u4e1c\u5317 | \u7b80\u88c5", "\u946b\u5146\u96c5\u56ed\u5317\u533a ", " | 1\u5ba40\u5385 | 36.44\u5e73\u7c73 | \u4e1c | \u7b80\u88c5", "\u7ea2\u5c45\u659c\u8857 ", " | 1\u5ba41\u5385 | 47.28\u5e73\u7c73 | \u5357 | \u7cbe\u88c5", "\u519c\u5149\u91cc ", " | 3\u5ba41\u5385 | 96.48\u5e73\u7c73 | \u4e1c \u5357 \u5317 | \u7cbe\u88c5", "\u5bcc\u529b\u53c8\u4e00\u57ceC\u533a ", " | 2\u5ba41\u5385 | 87.71\u5e73\u7c73 | \u5357 | \u7cbe\u88c5", "\u6c34\u7893\u5b50\u4e1c\u91cc ", " | 4\u5ba41\u5385 | 102.2\u5e73\u7c73 | \u5357 \u5317 | \u7b80\u88c5", "\u5317\u8857\u5bb6\u56ed\u516d\u533a ", " | 2\u5ba41\u5385 | 69.15\u5e73\u7c73 | \u5357 \u5317 | \u7b80\u88c5", "\u4e07\u5e74\u82b1\u57ce\u56db\u671f ", " | 1\u5ba41\u5385 | 71.11\u5e73\u7c73 | \u5317 | \u7cbe\u88c5", "\u5929\u901a\u82d1\u4e1c\u4e8c\u533a ", " | 2\u5ba41\u5385 | 91.71\u5e73\u7c73 | \u5357 \u5317 | \u7cbe\u88c5", "\u7d2b\u91d1\u65b0\u5e72\u7ebf ", " | 1\u5ba40\u5385 | 46.04\u5e73\u7c73 | \u5357 | \u7cbe\u88c5" +]}, +{ +"\u5c5e\u602711": [ +"\u9ad8\u697c\u5c42(\u51716\u5c42)1990\u5e74\u5efa\u677f\u697c - ", "\u9676\u7136\u4ead", "\u5e95\u5c42(\u51715\u5c42)1970\u5e74\u5efa\u677f\u697c - ", "\u9676\u7136\u4ead", "\u4e2d\u697c\u5c42(\u51716\u5c42)1992\u5e74\u5efa\u677f\u697c - ", "\u897f\u56db", "\u4f4e\u697c\u5c42(\u517118\u5c42)2009\u5e74\u5efa\u5854\u697c - ", "\u6b66\u5937\u82b1\u56ed", "\u4e2d\u697c\u5c42(\u51716\u5c42)1989\u5e74\u5efa\u677f\u697c - ", "\u77f3\u4f5b\u8425", "\u9876\u5c42(\u51716\u5c42)1998\u5e74\u5efa\u677f\u697c - ", "\u901a\u5dde\u5317\u82d1", "\u4e2d\u697c\u5c42(\u51717\u5c42)2000\u5e74\u5efa\u677f\u697c - ", "\u56de\u9f99\u89c2", "\u9ad8\u697c\u5c42(\u51719\u5c42)2004\u5e74\u5efa\u677f\u5854\u7ed3\u5408 - ", "\u7389\u6865", "\u4f4e\u697c\u5c42(\u51717\u5c42)2000\u5e74\u5efa\u677f\u697c - ", "\u897f\u4e09\u65d7", "\u5e95\u5c42(\u51716\u5c42)1985\u5e74\u5efa\u677f\u697c - ", "\u53cc\u6986\u6811", "\u9876\u5c42(\u51713\u5c42)1993\u5e74\u5efa\u677f\u697c - ", "\u82f9\u679c\u56ed", "\u9ad8\u697c\u5c42(\u51716\u5c42)1999\u5e74\u5efa\u677f\u697c - ", "\u5de5\u4f53", "\u4f4e\u697c\u5c42(\u517123\u5c42)2003\u5e74\u5efa\u5854\u697c - ", "\u52b2\u677e", "\u4f4e\u697c\u5c42(\u51716\u5c42)1980\u5e74\u5efa\u677f\u697c - ", "\u56e2\u7ed3\u6e56", "\u5e95\u5c42(\u51715\u5c42)1970\u5e74\u5efa\u677f\u697c - ", "\u6728\u6a28\u5730", "\u4e2d\u697c\u5c42(\u51717\u5c42)1994\u5e74\u5efa\u677f\u697c - ", "\u4e9a\u8fd0\u6751\u5c0f\u8425", "\u9876\u5c42(\u517110\u5c42)2006\u5e74\u5efa\u677f\u697c - ", "\u56de\u9f99\u89c2", "\u4f4e\u697c\u5c42(\u51717\u5c42)2001\u5e74\u5efa\u677f\u697c - ", "\u5929\u901a\u82d1", "\u5e95\u5c42(\u51716\u5c42)1987\u5e74\u5efa\u677f\u697c - ", "\u5b89\u8d1e", "\u4e2d\u697c\u5c42(\u517118\u5c42)2005\u5e74\u5efa\u5854\u697c - ", "\u77f3\u4f5b\u8425", "\u4e2d\u697c\u5c42(\u517116\u5c42)2003\u5e74\u5efa\u677f\u5854\u7ed3\u5408 - ", "\u9676\u7136\u4ead", "\u4e2d\u697c\u5c42(\u517120\u5c42)2004\u5e74\u5efa\u677f\u5854\u7ed3\u5408 - ", "\u5218\u5bb6\u7a91", "\u4e2d\u697c\u5c42(\u51716\u5c42)1997\u5e74\u5efa\u677f\u697c - ", "\u5e7f\u5b89\u95e8", "\u4e2d\u697c\u5c42(\u517123\u5c42)1996\u5e74\u5efa\u5854\u697c - ", "\u52b2\u677e", "\u4f4e\u697c\u5c42(\u517123\u5c42)2011\u5e74\u5efa\u5854\u697c - ", "\u8c46\u5404\u5e84", "\u4f4e\u697c\u5c42(\u51716\u5c42)1984\u5e74\u5efa\u677f\u697c - ", "\u751c\u6c34\u56ed", "\u4f4e\u697c\u5c42(\u517118\u5c42)2010\u5e74\u5efa\u677f\u697c - ", "\u6c99\u6cb3", "\u9ad8\u697c\u5c42(\u517127\u5c42)2008\u5e74\u5efa\u5854\u697c - ", "\u7389\u6cc9\u8425", "\u4e2d\u697c\u5c42(\u51717\u5c42)2001\u5e74\u5efa\u677f\u697c - ", "\u5929\u901a\u82d1", "\u9ad8\u697c\u5c42(\u517110\u5c42)2009\u5e74\u5efa\u677f\u697c - ", "\u970d\u8425" +]}, +{ +"\u5c5e\u602712": ["\u9676\u7136\u4ead", "\u9676\u7136\u4ead", "\u897f\u56db", "\u6b66\u5937\u82b1\u56ed", "\u77f3\u4f5b\u8425", "\u901a\u5dde\u5317\u82d1", "\u56de\u9f99\u89c2", "\u7389\u6865", "\u897f\u4e09\u65d7", "\u53cc\u6986\u6811", "\u82f9\u679c\u56ed", "\u5de5\u4f53", "\u52b2\u677e", "\u56e2\u7ed3\u6e56", "\u6728\u6a28\u5730", "\u4e9a\u8fd0\u6751\u5c0f\u8425", "\u56de\u9f99\u89c2", "\u5929\u901a\u82d1", "\u5b89\u8d1e", "\u77f3\u4f5b\u8425", "\u9676\u7136\u4ead", "\u5218\u5bb6\u7a91", "\u5e7f\u5b89\u95e8", "\u52b2\u677e", "\u8c46\u5404\u5e84", "\u751c\u6c34\u56ed", "\u6c99\u6cb3", "\u7389\u6cc9\u8425", "\u5929\u901a\u82d1", "\u970d\u8425"]}, +{ +"\u5c5e\u602713": [ +"35\u4eba\u5173\u6ce8 / 6\u5929\u4ee5\u524d\u53d1\u5e03", "34\u4eba\u5173\u6ce8 / 26\u5929\u4ee5\u524d\u53d1\u5e03", "29\u4eba\u5173\u6ce8 / 7\u5929\u4ee5\u524d\u53d1\u5e03", "52\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "341\u4eba\u5173\u6ce8 / 3\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "105\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "151\u4eba\u5173\u6ce8 / 23\u5929\u4ee5\u524d\u53d1\u5e03", "44\u4eba\u5173\u6ce8 / 3\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "62\u4eba\u5173\u6ce8 / 22\u5929\u4ee5\u524d\u53d1\u5e03", "30\u4eba\u5173\u6ce8 / 28\u5929\u4ee5\u524d\u53d1\u5e03", "121\u4eba\u5173\u6ce8 / 2\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "106\u4eba\u5173\u6ce8 / 4\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "208\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "58\u4eba\u5173\u6ce8 / 4\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "9\u4eba\u5173\u6ce8 / 5\u5929\u4ee5\u524d\u53d1\u5e03", "20\u4eba\u5173\u6ce8 / 6\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "70\u4eba\u5173\u6ce8 / 22\u5929\u4ee5\u524d\u53d1\u5e03", "21\u4eba\u5173\u6ce8 / 7\u5929\u4ee5\u524d\u53d1\u5e03", "27\u4eba\u5173\u6ce8 / 9\u5929\u4ee5\u524d\u53d1\u5e03", "84\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "27\u4eba\u5173\u6ce8 / 2\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "24\u4eba\u5173\u6ce8 / 10\u5929\u4ee5\u524d\u53d1\u5e03", "67\u4eba\u5173\u6ce8 / 27\u5929\u4ee5\u524d\u53d1\u5e03", "103\u4eba\u5173\u6ce8 / 9\u5929\u4ee5\u524d\u53d1\u5e03", "56\u4eba\u5173\u6ce8 / 4\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "44\u4eba\u5173\u6ce8 / 8\u5929\u4ee5\u524d\u53d1\u5e03", "141\u4eba\u5173\u6ce8 / 4\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "9\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "39\u4eba\u5173\u6ce8 / 1\u4e2a\u6708\u4ee5\u524d\u53d1\u5e03", "42\u4eba\u5173\u6ce8 / 22\u5929\u4ee5\u524d\u53d1\u5e03" +]}, +{ +"tag_good_class": ["good", "good", "good", "vr", "good", "vr", "good", "good", "good", "good", "vr", "vr", "good", "good", "good", "good", "vr", "vr", "good", "vr", "good", "good", "good", "good", "vr", "good", "taxfree", "good", "vr", "vr"]}, +{ +"tag_haskey": ["VR\u623f\u6e90", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e94\u5e74", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e94\u5e74", "VR\u623f\u6e90", "VR\u623f\u6e90", "VR\u623f\u6e90", "VR\u623f\u6e90", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "VR\u623f\u6e90", "VR\u623f\u6e90", "\u968f\u65f6\u770b\u623f", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e94\u5e74", "VR\u623f\u6e90", "VR\u623f\u6e90", "VR\u623f\u6e90", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e24\u5e74", "VR\u623f\u6e90", "VR\u623f\u6e90", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74"]}, +{ +"tag_taxfree": ["\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e24\u5e74", "\u623f\u672c\u6ee1\u4e24\u5e74", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u968f\u65f6\u770b\u623f", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e94\u5e74", "\u623f\u672c\u6ee1\u4e24\u5e74", "\u623f\u672c\u6ee1\u4e24\u5e74", "\u968f\u65f6\u770b\u623f", "\u968f\u65f6\u770b\u623f"]}, +{ +"span": ["539", "700", "669", "440", "276", "225", "405", "632", "475", "540", "390", "870", "258", "430", "726", "445", "395", "425", "420", "531", "630", "275", "480", "560", "480", "710", "333", "450", "388", "319"]}, +{ +"\u5c5e\u602718": [ +"\u5355\u4ef7151746\u5143/\u5e73\u7c73", "\u5355\u4ef7145199\u5143/\u5e73\u7c73", "\u5355\u4ef7169797\u5143/\u5e73\u7c73", "\u5355\u4ef743682\u5143/\u5e73\u7c73", "\u5355\u4ef765605\u5143/\u5e73\u7c73", "\u5355\u4ef739753\u5143/\u5e73\u7c73", "\u5355\u4ef747964\u5143/\u5e73\u7c73", "\u5355\u4ef749918\u5143/\u5e73\u7c73", "\u5355\u4ef761354\u5143/\u5e73\u7c73", "\u5355\u4ef7107785\u5143/\u5e73\u7c73", "\u5355\u4ef738409\u5143/\u5e73\u7c73", "\u5355\u4ef776424\u5143/\u5e73\u7c73", "\u5355\u4ef766444\u5143/\u5e73\u7c73", "\u5355\u4ef770724\u5143/\u5e73\u7c73", "\u5355\u4ef7144911\u5143/\u5e73\u7c73", "\u5355\u4ef775297\u5143/\u5e73\u7c73", "\u5355\u4ef742125\u5143/\u5e73\u7c73", "\u5355\u4ef737981\u5143/\u5e73\u7c73", "\u5355\u4ef780522\u5143/\u5e73\u7c73", "\u5355\u4ef763853\u5143/\u5e73\u7c73", "\u5355\u4ef796968\u5143/\u5e73\u7c73", "\u5355\u4ef775467\u5143/\u5e73\u7c73", "\u5355\u4ef7101523\u5143/\u5e73\u7c73", "\u5355\u4ef758044\u5143/\u5e73\u7c73", "\u5355\u4ef754726\u5143/\u5e73\u7c73", "\u5355\u4ef769472\u5143/\u5e73\u7c73", "\u5355\u4ef748157\u5143/\u5e73\u7c73", "\u5355\u4ef763283\u5143/\u5e73\u7c73", "\u5355\u4ef742308\u5143/\u5e73\u7c73", "\u5355\u4ef769288\u5143/\u5e73\u7c73" +]}, +{ +"sellListContent_clear_LOGVIEWDATA_LOGCLICKDATA_data-lj_action_click_position": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"]}, +{ +"sellListContent_clear_LOGVIEWDATA_LOGCLICKDATA_data-lj_action_resblock_id": ["1111027381788", "1111027375434", "1111027375640", "1111027381831", "1111027376016", "1111027381072", "1111027381985", "1111027374633", "1111027378786", "1111027379678", "1111027378397", "1111027380701", "1111027374187", "1111046810631", "1111027378900", "1111027379323", "1111027375875", "1111027380049", "1111027375686", "1111027376659", "1111027376481", "1111027381482", "1111027375103", "1111027378492", "1111027374303", "1111027379192", "1111027379515", "1111027380388", "1111027380049", "1111027382546"] +}, diff --git a/outFile/__init__.py b/outFile/__init__.py new file mode 100644 index 0000000..6691cb3 --- /dev/null +++ b/outFile/__init__.py @@ -0,0 +1,6 @@ +# -*- encoding: utf-8 -*- +""" +@File : __init__.py +@Time : 19/8/2019 18:45 +@Author : liyang +""" diff --git a/outFile/out.json b/outFile/out.json new file mode 100644 index 0000000..8e14d8c --- /dev/null +++ b/outFile/out.json @@ -0,0 +1,743 @@ +[ + { + 'noresultRecommend_img_LOGCLICKDATA_vr_item_src': [ + 'https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686', + 'https://s1.ljcdn.com/feroot/pc/asset/img/vr/vrlogo.png?_v=20190814143630686' + ] + }, + { + 'noresultRecommend_img_LOGCLICKDATA_vr_item_class': [ + 'lj-lazy', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'lj-lazy', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'vr_item', + 'lj-lazy', + 'vr_item', + 'vr_item', + 'vr_item' + ] + }, + { + 'noresultRecommend_img_LOGCLICKDATA_lj-lazy_data-original': [ + 'https://image1.ljcdn.com/110000-inspection/prod-4b341336-be62-4db2-b57e-5a26102ed699.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-eddc4c04-031c-43b8-b9aa-57a79883db5f.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-f2a3eb14-e5b3-4b85-8eb6-fc72407fc573.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-387d1411-3bb7-4a05-9c21-ccc2683bfc96.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-eb4806bd-bf20-4765-8aa6-01b3976ae697.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-692b3dd4-3dff-4517-8ae9-6084efe2b049.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-2a920848-c463-4a9c-8a7e-7b89221c5106.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-6c1e1381-f1d3-44c6-b257-b881c348f268.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-d704117b-f678-4f91-839c-b9adb8b6c643.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-c8a7ccef-84b4-42cd-a11f-2be5886f88e3.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-6a44d10e-9cdf-4f60-a0ff-d522cbc45f03.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-47caa008-a052-4d8a-85f9-58edaad3eea4.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-1154fd94-dd2b-4da1-899b-988a8fe6c35a.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/test-b5bbeefd-2fb4-452d-a752-ea015732e709.png.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-dcb1b6d0-629c-482e-ad63-cd51f0bbdd58.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-69ba93a0-d55c-4ee6-b125-04bf03bf2ba4.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-484fed7b-1801-4640-8c4d-94cba67e52e4.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-9582f142-2e3f-4282-95db-e100fe1d4cf9.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-024fd6b8-6760-4f75-9bf3-189f152a66b7.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-756ec8d2-ba26-4dae-abb9-756fa7dc1db3.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-4a5f973f-ea2f-40d7-bf88-605f7f578fed.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-ef191c81-16ec-42eb-a9d8-1b303d692fc8.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-2234ef15-0ab2-4dad-81da-96dc09f9f791.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-2346212e-061c-4510-85aa-9b37b68ad608.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-8703d387-e7b7-4d5a-9fbf-611374cb4ac1.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-e13379b0-06c6-4399-9d56-f76307508882.jpg.296x216.jpg', + 'https://image1.ljcdn.com/110000-inspection/prod-dc5419e3-ce76-46d6-a7e0-8316710edbfc.jpg.296x216.jpg' + ] + }, + { + 'noresultRecommend_img_LOGCLICKDATA_lj-lazy_alt': [ + '北京西城陶然亭', + '北京西城西四', + '北京通州武夷花园', + '北京朝阳石佛营', + '北京通州通州北苑', + '北京昌平回龙观', + '北京通州玉桥', + '北京海淀西三旗', + '北京海淀双榆树', + '北京石景山苹果园', + '北京朝阳工体', + '北京朝阳劲松', + '北京朝阳团结湖', + '北京朝阳亚运村小营', + '北京昌平回龙观', + '北京昌平天通苑', + '北京朝阳安贞', + '北京朝阳石佛营', + '北京西城陶然亭', + '北京丰台刘家窑', + '北京西城广安门', + '北京朝阳劲松', + '北京朝阳豆各庄', + '北京朝阳甜水园', + '北京丰台玉泉营', + '北京昌平天通苑', + '北京昌平霍营' + ] + }, + { + 'clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_href': [ + 'https://bj.lianjia.com/ershoufang/101105458455.html', + 'https://bj.lianjia.com/ershoufang/101105296451.html', + 'https://bj.lianjia.com/ershoufang/101105456335.html', + 'https://bj.lianjia.com/ershoufang/101105080586.html', + 'https://bj.lianjia.com/ershoufang/101104595999.html', + 'https://bj.lianjia.com/ershoufang/101105153614.html', + 'https://bj.lianjia.com/ershoufang/101105320359.html', + 'https://bj.lianjia.com/ershoufang/101104645675.html', + 'https://bj.lianjia.com/ershoufang/101105329813.html', + 'https://bj.lianjia.com/ershoufang/101105279486.html', + 'https://bj.lianjia.com/ershoufang/101104791137.html', + 'https://bj.lianjia.com/ershoufang/101104349401.html', + 'https://bj.lianjia.com/ershoufang/101105162851.html', + 'https://bj.lianjia.com/ershoufang/101104352142.html', + 'https://bj.lianjia.com/ershoufang/101105465856.html', + 'https://bj.lianjia.com/ershoufang/101104049181.html', + 'https://bj.lianjia.com/ershoufang/101105326853.html', + 'https://bj.lianjia.com/ershoufang/101105452686.html', + 'https://bj.lianjia.com/ershoufang/101105432369.html', + 'https://bj.lianjia.com/ershoufang/101105193502.html', + 'https://bj.lianjia.com/ershoufang/101104932338.html', + 'https://bj.lianjia.com/ershoufang/101105429916.html', + 'https://bj.lianjia.com/ershoufang/101105287275.html', + 'https://bj.lianjia.com/ershoufang/101105439062.html', + 'https://bj.lianjia.com/ershoufang/101104402834.html', + 'https://bj.lianjia.com/ershoufang/101105443249.html', + 'https://bj.lianjia.com/ershoufang/101104501599.html', + 'https://bj.lianjia.com/ershoufang/101105223490.html', + 'https://bj.lianjia.com/ershoufang/101105111764.html', + 'https://bj.lianjia.com/ershoufang/101105327653.html' + ] + }, + { + 'clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_data-log_index': [ + '1', + '2', + '3', + '4', + '5', + '6', + '7', + '8', + '9', + '10', + '11', + '12', + '13', + '14', + '15', + '16', + '17', + '18', + '19', + '20', + '21', + '22', + '23', + '24', + '25', + '26', + '27', + '28', + '29', + '30' + ] + }, + { + 'clear_LOGVIEWDATA_LOGCLICKDATA_noresultRecommend_img_LOGCLICKDATA_data-housecode': [ + '101105458455', + '101105296451', + '101105456335', + '101105080586', + '101104595999', + '101105153614', + '101105320359', + '101104645675', + '101105329813', + '101105279486', + '101104791137', + '101104349401', + '101105162851', + '101104352142', + '101105465856', + '101104049181', + '101105326853', + '101105452686', + '101105432369', + '101105193502', + '101104932338', + '101105429916', + '101105287275', + '101105439062', + '101104402834', + '101105443249', + '101104501599', + '101105223490', + '101105111764', + '101105327653' + ] + }, + { + 'title_': [ + '姚家井二巷 1室1厅 539万', + '黑窑厂西里南北通透低楼层两居室', + '爱民里 满五年 一居 随时可看', + '月季园 2室1厅 440万', + '全南向一居室,明厨明卫,中间楼层,视野好,采光充足', + '通州北苑商圈 满五唯一 全明户型 看房方便', + '商品房满五年唯一 小区中间位置 采光好', + '格兰晴天 3室2厅 632万', + '沁春家园精装修南北向两居室采光无遮挡业主诚心出售', + '板楼南向精装两居室 户型方正使用率高 看房方便', + '模式口北里 2室2厅 390万', + '满五年唯一,南北通透三居室,婚房装修', + '三环边儿富顿劲松地铁口低总价南向开间诚意出售', + '板楼的2层,小区安静整洁,税费少,随时签约', + '满五年不唯一公房,诚心出售,户型方正。', + '世纪村西区 满五唯一 精装南北两居室 中间层', + '回龙观精装修电梯两居两卫 南北通透 一梯两户', + '满五唯一无个税,板楼二层,南北双通透', + '安贞西里一层两居室,业主自住,楼下地铁安华桥站', + '朝阳捷座 满五唯一 中间楼层 精装修', + '畅柳园规矩两居室 东北朝向 电梯直达', + '东向一居室 满五年唯一商品房 税费少', + '西城区 红居斜街 1997年正规一居室 朝南中间楼层', + '南北通透全明3居,带电梯,带车位,精装修!', + '富力又一城C区,全南向两居,中间层,拎包即入住', + '水碓子东里南北四居室 明厨明卫', + '北街家园精装两居 南北通透 满五唯一 一梯两户', + '万年花城四期 1室1厅 450万', + '南北通透两居室,配合转商 中间楼层 采光视野好', + '板楼电梯房中,高楼层,户型方正,南向开间' + ] + }, + { + 'title_yezhushuo_tagBlock': [ + '新上', + '新上', + '房主自荐', + '房主自荐', + '房主自荐', + '房主自荐', + '房主自荐', + '房主自荐', + '新上', + '房主自荐', + '房主自荐', + '房主自荐' + ] + }, + { + 'a': [ + '姚家井二巷 ', + '黑窑厂西里 ', + '爱民里小区 ', + '月季园 ', + '八里庄北里小区 ', + '西马庄园 ', + '云趣园一区 ', + '格兰晴天 ', + '沁春家园 ', + '双榆树北里 ', + '模式口北里 ', + '幸福二村 ', + '富顿 ', + '团结湖中路南一条 ', + '汽南小区 ', + '世纪村西区 ', + '佰嘉城 ', + '天通苑东二区 ', + '安贞西里 ', + '朝阳捷座 ', + '畅柳园 ', + '鑫兆雅园北区 ', + '红居斜街 ', + '农光里 ', + '富力又一城C区 ', + '水碓子东里 ', + '北街家园六区 ', + '万年花城四期 ', + '天通苑东二区 ', + '紫金新干线 ' + ] + }, + { + '属性10': [ + '姚家井二巷 ', + ' | 1室1厅 | 35.52平米 | 南 | 其他', + '黑窑厂西里 ', + ' | 2室1厅 | 48.21平米 | 南 北 | 精装', + '爱民里小区 ', + ' | 1室1厅 | 39.4平米 | 南 | 简装', + '月季园 ', + ' | 2室1厅 | 100.73平米 | 东 南 北 | 简装', + '八里庄北里小区 ', + ' | 1室1厅 | 42.07平米 | 南 | 简装', + '西马庄园 ', + ' | 2室1厅 | 56.6平米 | 东 西 | 精装', + '云趣园一区 ', + ' | 2室1厅 | 84.44平米 | 南 北 | 简装', + '格兰晴天 ', + ' | 3室2厅 | 126.61平米 | 东南 | 简装', + '沁春家园 ', + ' | 2室1厅 | 77.42平米 | 南 北 | 精装', + '双榆树北里 ', + ' | 2室1厅 | 50.1平米 | 南 | 精装', + '模式口北里 ', + ' | 2室2厅 | 101.54平米 | 南 北 | 精装', + '幸福二村 ', + ' | 3室1厅 | 113.84平米 | 南 北 | 精装', + '富顿 ', + ' | 1室0厅 | 38.83平米 | 南 | 简装', + '团结湖中路南一条 ', + ' | 2室1厅 | 60.8平米 | 南 北 | 简装', + '汽南小区 ', + ' | 2室1厅 | 50.1平米 | 南 西 北 | 简装', + '世纪村西区 ', + ' | 2室1厅 | 59.1平米 | 南 北 | 精装', + '佰嘉城 ', + ' | 2室1厅 | 93.77平米 | 南 北 | 精装', + '天通苑东二区 ', + ' | 2室1厅 | 111.9平米 | 南 北 | 精装', + '安贞西里 ', + ' | 2室1厅 | 52.16平米 | 东 西 | 精装', + '朝阳捷座 ', + ' | 2室1厅 | 83.16平米 | 东 | 精装', + '畅柳园 ', + ' | 2室1厅 | 64.97平米 | 东北 | 简装', + '鑫兆雅园北区 ', + ' | 1室0厅 | 36.44平米 | 东 | 简装', + '红居斜街 ', + ' | 1室1厅 | 47.28平米 | 南 | 精装', + '农光里 ', + ' | 3室1厅 | 96.48平米 | 东 南 北 | 精装', + '富力又一城C区 ', + ' | 2室1厅 | 87.71平米 | 南 | 精装', + '水碓子东里 ', + ' | 4室1厅 | 102.2平米 | 南 北 | 简装', + '北街家园六区 ', + ' | 2室1厅 | 69.15平米 | 南 北 | 简装', + '万年花城四期 ', + ' | 1室1厅 | 71.11平米 | 北 | 精装', + '天通苑东二区 ', + ' | 2室1厅 | 91.71平米 | 南 北 | 精装', + '紫金新干线 ', + ' | 1室0厅 | 46.04平米 | 南 | 精装' + ] + }, + { + '属性11': [ + '高楼层(共6层)1990年建板楼 - ', + '陶然亭', + '底层(共5层)1970年建板楼 - ', + '陶然亭', + '中楼层(共6层)1992年建板楼 - ', + '西四', + '低楼层(共18层)2009年建塔楼 - ', + '武夷花园', + '中楼层(共6层)1989年建板楼 - ', + '石佛营', + '顶层(共6层)1998年建板楼 - ', + '通州北苑', + '中楼层(共7层)2000年建板楼 - ', + '回龙观', + '高楼层(共9层)2004年建板塔结合 - ', + '玉桥', + '低楼层(共7层)2000年建板楼 - ', + '西三旗', + '底层(共6层)1985年建板楼 - ', + '双榆树', + '顶层(共3层)1993年建板楼 - ', + '苹果园', + '高楼层(共6层)1999年建板楼 - ', + '工体', + '低楼层(共23层)2003年建塔楼 - ', + '劲松', + '低楼层(共6层)1980年建板楼 - ', + '团结湖', + '底层(共5层)1970年建板楼 - ', + '木樨地', + '中楼层(共7层)1994年建板楼 - ', + '亚运村小营', + '顶层(共10层)2006年建板楼 - ', + '回龙观', + '低楼层(共7层)2001年建板楼 - ', + '天通苑', + '底层(共6层)1987年建板楼 - ', + '安贞', + '中楼层(共18层)2005年建塔楼 - ', + '石佛营', + '中楼层(共16层)2003年建板塔结合 - ', + '陶然亭', + '中楼层(共20层)2004年建板塔结合 - ', + '刘家窑', + '中楼层(共6层)1997年建板楼 - ', + '广安门', + '中楼层(共23层)1996年建塔楼 - ', + '劲松', + '低楼层(共23层)2011年建塔楼 - ', + '豆各庄', + '低楼层(共6层)1984年建板楼 - ', + '甜水园', + '低楼层(共18层)2010年建板楼 - ', + '沙河', + '高楼层(共27层)2008年建塔楼 - ', + '玉泉营', + '中楼层(共7层)2001年建板楼 - ', + '天通苑', + '高楼层(共10层)2009年建板楼 - ', + '霍营' + ] + }, + { + '属性12': [ + '陶然亭', + '陶然亭', + '西四', + '武夷花园', + '石佛营', + '通州北苑', + '回龙观', + '玉桥', + '西三旗', + '双榆树', + '苹果园', + '工体', + '劲松', + '团结湖', + '木樨地', + '亚运村小营', + '回龙观', + '天通苑', + '安贞', + '石佛营', + '陶然亭', + '刘家窑', + '广安门', + '劲松', + '豆各庄', + '甜水园', + '沙河', + '玉泉营', + '天通苑', + '霍营' + ] + }, + { + '属性13': [ + '34人关注 / 6天以前发布', + '33人关注 / 25天以前发布', + '27人关注 / 6天以前发布', + '48人关注 / 1个月以前发布', + '330人关注 / 3个月以前发布', + '91人关注 / 1个月以前发布', + '147人关注 / 22天以前发布', + '41人关注 / 3个月以前发布', + '59人关注 / 21天以前发布', + '28人关注 / 28天以前发布', + '107人关注 / 2个月以前发布', + '101人关注 / 4个月以前发布', + '196人关注 / 1个月以前发布', + '54人关注 / 4个月以前发布', + '9人关注 / 5天以前发布', + '17人关注 / 6个月以前发布', + '65人关注 / 22天以前发布', + '21人关注 / 7天以前发布', + '25人关注 / 9天以前发布', + '77人关注 / 1个月以前发布', + '27人关注 / 2个月以前发布', + '21人关注 / 9天以前发布', + '62人关注 / 27天以前发布', + '94人关注 / 8天以前发布', + '52人关注 / 4个月以前发布', + '41人关注 / 8天以前发布', + '139人关注 / 4个月以前发布', + '8人关注 / 1个月以前发布', + '37人关注 / 1个月以前发布', + '39人关注 / 22天以前发布' + ] + }, + { + 'tag_good_class': [ + 'good', + 'good', + 'good', + 'vr', + 'good', + 'vr', + 'good', + 'good', + 'good', + 'good', + 'vr', + 'vr', + 'good', + 'good', + 'good', + 'good', + 'vr', + 'vr', + 'good', + 'vr', + 'good', + 'good', + 'good', + 'good', + 'vr', + 'good', + 'taxfree', + 'good', + 'vr', + 'vr' + ] + }, + { + 'tag_haskey': [ + 'VR房源', + 'VR房源', + '房本满五年', + 'VR房源', + '房本满五年', + 'VR房源', + 'VR房源', + 'VR房源', + 'VR房源', + '随时看房', + '房本满五年', + 'VR房源', + 'VR房源', + '随时看房', + 'VR房源', + '房本满五年', + '房本满五年', + 'VR房源', + '房本满五年', + 'VR房源', + 'VR房源', + 'VR房源', + 'VR房源', + '房本满两年', + 'VR房源', + 'VR房源', + '房本满五年', + '房本满五年' + ] + }, + { + 'tag_taxfree': [ + '房本满五年', + '房本满五年', + '随时看房', + '房本满五年', + '随时看房', + '房本满五年', + '房本满五年', + '房本满两年', + '房本满两年', + '随时看房', + '房本满五年', + '房本满五年', + '房本满五年', + '随时看房', + '房本满五年', + '随时看房', + '房本满五年', + '房本满五年', + '房本满五年', + '房本满两年', + '房本满两年', + '随时看房', + '随时看房' + ] + }, + { + 'span': [ + '539', + '700', + '669', + '440', + '276', + '225', + '405', + '632', + '475', + '540', + '390', + '870', + '258', + '430', + '726', + '445', + '395', + '425', + '420', + '531', + '630', + '275', + '480', + '560', + '480', + '710', + '333', + '450', + '388', + '319' + ] + }, + { + '属性18': [ + '单价151746元/平米', + '单价145199元/平米', + '单价169797元/平米', + '单价43682元/平米', + '单价65605元/平米', + '单价39753元/平米', + '单价47964元/平米', + '单价49918元/平米', + '单价61354元/平米', + '单价107785元/平米', + '单价38409元/平米', + '单价76424元/平米', + '单价66444元/平米', + '单价70724元/平米', + '单价144911元/平米', + '单价75297元/平米', + '单价42125元/平米', + '单价37981元/平米', + '单价80522元/平米', + '单价63853元/平米', + '单价96968元/平米', + '单价75467元/平米', + '单价101523元/平米', + '单价58044元/平米', + '单价54726元/平米', + '单价69472元/平米', + '单价48157元/平米', + '单价63283元/平米', + '单价42308元/平米', + '单价69288元/平米' + ] + }, + { + 'sellListContent_clear_LOGVIEWDATA_LOGCLICKDATA_data-lj_action_click_position': [ + '0', + '1', + '2', + '3', + '4', + '5', + '6', + '7', + '8', + '9', + '10', + '11', + '12', + '13', + '14', + '15', + '16', + '17', + '18', + '19', + '20', + '21', + '22', + '23', + '24', + '25', + '26', + '27', + '28', + '29' + ] + }, + { + 'sellListContent_clear_LOGVIEWDATA_LOGCLICKDATA_data-lj_action_resblock_id': [ + '1111027381788', + '1111027375434', + '1111027375640', + '1111027381831', + '1111027376016', + '1111027381072', + '1111027381985', + '1111027374633', + '1111027378786', + '1111027379678', + '1111027378397', + '1111027380701', + '1111027374187', + '1111046810631', + '1111027378900', + '1111027379323', + '1111027375875', + '1111027380049', + '1111027375686', + '1111027376659', + '1111027376481', + '1111027381482', + '1111027375103', + '1111027378492', + '1111027374303', + '1111027379192', + '1111027379515', + '1111027380388', + '1111027380049', + '1111027382546' + ] + } +] \ No newline at end of file diff --git a/project.xml b/project.xml deleted file mode 100644 index b693c76..0000000 --- a/project.xml +++ /dev/null @@ -1,105 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/sample.py b/sample.py deleted file mode 100644 index a887c57..0000000 --- a/sample.py +++ /dev/null @@ -1,20 +0,0 @@ -import etl; - -import extends -import time; -path='/home/desert.zym/dev' - -proj=etl.Project_LoadXml(path+'/Hawk-Projects/图片抓取/昵图网.xml'); -lagou=proj.modules['昵图网']; -tools= lagou.AllETLTools; -tools[-12].Format="/cloud/usr/desert.zym/picture/昵图网/{1}/{0}.jpg"; -tools[-1].Enabled=False; -tools[-9].Enabled=False; -#for r in lagou.QueryDatas(etlCount=19,execute=False): -# print(r) -# print(r) -from distributed import * -master =Master(proj,"昵图网"); -master.start(); - - diff --git a/spider.py b/spider.py deleted file mode 100644 index b850318..0000000 --- a/spider.py +++ /dev/null @@ -1,334 +0,0 @@ -# coding=utf-8 -import gzip -import re -import socket -import urllib.request -from lxml import etree -from urllib.parse import urlparse,urlunparse -import extends; -import http.cookiejar -from urllib.request import quote - -boxRegex = re.compile(r"\[\d{1,3}\]"); - - -class CrawItem(extends.EObject): - def __init__(self, name=None, sample=None, ismust=False, isHTMLorText=True, xpath=None): - self.XPath = xpath; - self.Sample = sample; - self.Name = name; - self.IsMust = ismust; - self.IsHTMLorText = isHTMLorText; - self.Children = []; - - def __str__(self): - return "%s %s %s" % (self.Name, self.XPath, self.Sample); - - -def RemoveFinalNum(paths): - v = paths[-1]; - m = boxRegex.search(v); - if m is None: - return paths; - s = m.group(0); - paths[-1] = v.replace(s, ""); - return paths; - - -def GetMaxCompareXPath(items): - xpaths = [r.XPath.split('/') for r in items]; - minlen = min(len(r) for r in xpaths); - c = None; - for i in range(minlen): - for index in range(len(xpaths)): - path = xpaths[index]; - if index == 0: - c = path[i]; - elif c != path[i]: - first = path[0:i + 1]; - return '/'.join(RemoveFinalNum(first)); - - -attrsplit=re.compile('@|\['); - -def GetDataFromXPath(node, path): - p = node.xpath(path); - if p is None: - return None; - if len(p) == 0: - return None; - paths = path.split('/'); - last = paths[-1]; - if last.find('@')>=0 and last.find('[1]')>=0: - return p[0]; - return getnodetext(p[0]); - - - - - - - - -def GetImage(addr, fname): - u = urllib.urlopen(addr) - data = u.read() - f = open(fname, 'wb') - f.write(data) - f.close() - - -def urlEncodeNonAscii(b): - return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b) - -def iriToUri(iri): - parts= urlparse(iri) - - pp= [(parti,part) for parti, part in enumerate(parts)] - res=[]; - for p in pp: - res.append(p[1] if p[0] != 4 else quote(p[1] )) - - return urlunparse(res); - - - - -extract = re.compile('\[(\w+)\]'); - -charset = re.compile(r'content="text/html;.?charset=(.*?)"'); -class HTTPItem(extends.EObject): - def __init__(self): - self.Url = '' - self.Cookie = ''; - self.Headers = None; - self.Timeout = 30; - self.opener = ""; - self.postdata='' - - def PraseURL(self, url): - u = Para2Dict(urlparse(self.Url).query, '&', '='); - for r in extract.findall(url): - url = url.replace('[' + r + ']', u[r]) - return url; - - def GetHTML(self, destUrl=None): - if destUrl is None: - destUrl = self.Url; - destUrl = self.PraseURL(destUrl); - socket.setdefaulttimeout(self.Timeout); - cj = http.cookiejar.CookieJar() - pro = urllib.request.HTTPCookieProcessor(cj) - opener = urllib.request.build_opener(pro) - t = [(r, self.Headers[r]) for r in self.Headers]; - opener.addheaders = t; - binary_data = self.postdata.encode('utf-8') - try: - destUrl.encode('ascii') - except UnicodeEncodeError: - destUrl = iriToUri(destUrl) - - try: - if self.postdata=='': - page=opener.open(destUrl); - else: - page = opener.open(destUrl, binary_data) - html = page.read() - except Exception as e: - print(e); - return "" - - - if page.info().get('Content-Encoding') == 'gzip': - html = gzip.decompress(html) - encoding = charset.search(str(html)) - if encoding is not None: - encoding = encoding.group(1); - if encoding is None: - encoding = 'utf-8' - try: - html=html.decode(encoding) - except UnicodeDecodeError as e: - print(e); - import chardet - encoding= chardet.detect(html) - html=html.decode(encoding); - - return html; - - -# 解压函数 -def ungzip(data): - data = gzip.decompress(data) - return data; - -def IsNone(data): - return data is None or data==''; - -def __getnodetext__(node, arrs): - t=node.text; - if t is not None: - s = t.strip(); - if s != '': - arrs.append(s) - for sub in node.iterchildren(): - __getnodetext__(sub,arrs) - -def getnodetext(node): - if node is None: - return "" - arrs=[]; - __getnodetext__(node,arrs); - return ' '.join(arrs); - - -class SmartCrawler(extends.EObject): - def __init__(self): - self.IsMultiData = "List"; - self.HttpItem = None; - self.Name = None; - self.CrawItems = None; - self.Login = ""; - self.haslogin = False; - self.RootXPath='' - - def autologin(self, loginItem): - if loginItem.postdata is None: - return; - import http.cookiejar - cj = http.cookiejar.CookieJar() - pro = urllib.request.HTTPCookieProcessor(cj) - opener = urllib.request.build_opener(pro) - t = [(r, loginItem.Headers[r]) for r in loginItem.Headers]; - opener.addheaders = t; - binary_data = loginItem.postdata.encode('utf-8') - op = opener.open(loginItem.Url, binary_data) - data = op.read().decode('utf-8') - print(data) - self.HttpItem.Url = op.url; - return opener; - - def CrawData(self, url): - - if self.Login !="" and self.haslogin == False: - self.HttpItem.opener = self.autologin(self.Login); - self.haslogin = True; - html = self.HttpItem.GetHTML(url); - - root =None if html=='' else etree.HTML(html); - if root is None: - return {} if self.IsMultiData == 'One' else []; - - tree = etree.ElementTree(root); - if isinstance(self.CrawItems, list) and len(self.CrawItems) == 0: - return {'Content': html}; - - return self.GetDataFromCrawItems(tree ); - - def GetDataFromCrawItems(self,tree): - documents = []; - if self.IsMultiData =='One': - document = {}; - for r in self.CrawItems: - data = GetDataFromXPath(tree, r.XPath); - if data is not None: - document[r.Name] = data; - else: - document[r.Name] = ""; - return document; - else: - if not IsNone(self.RootXPath): - rootXPath = self.RootXPath; - else: - rootXPath = GetMaxCompareXPath(self.CrawItems); - nodes = tree.xpath(rootXPath) - if nodes is not None: - for node in nodes: - document = {}; - for r in self.CrawItems: - path=r.XPath; - if IsNone(self.RootXPath): - paths=r.XPath.split('/'); - path='/'.join(paths[len(rootXPath.split('/')):len(paths)]); - else: - path= tree.getpath(node)+ path; - data = GetDataFromXPath(node,path); - if data is not None: - document[r.Name] = data; - if len(document) == 0: - continue; - documents.append(document); - return documents; - -def Para2Dict(para, split1, split2): - r = {}; - for s in para.split(split1): - rs = s.split(split2); - if len(rs) < 2: - continue; - key = rs[0]; - value = s[len(key) + 1:]; - r[rs[0]] = value; - - return r; - - -def GetHTML(url, code=None): - url = url.strip(); - if not url.startswith('http'): - url = 'http://' + url; - print("auto transform %s" % (url)); - socket.setdefaulttimeout(30) - i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5", - "Accept": "text/plain"} - req = urllib.request.Request(url=url, headers=i_headers) - page = urllib.request.urlopen(req) - html = page.read() - return html; - - -def GetHTMLFromFile(fname): - f = open(fname, 'r', 'utf-8'); - r = f.read(); - return r; - - -def GetCrawNode(craws, name, tree): - for r in craws: - if r.Name == name: - return tree.xpath(r.XPath); - return None; - - -def GetImageFormat(name): - if name is None: - return None, None; - p = name.split('.'); - if len(p) != 2: - return name, 'jpg'; - - back = p[-1]; - if back == "jpg" or back == "png" or back == "gif": # back=="png" ignore because png is so big! - return p[-2], back; - return None, None; - - -def GetCrawData(crawitems, tree): - doc = {}; - for crawItem in crawitems: - node = tree.xpath(crawItem.XPath); - if len(node) == 0: - if crawItem.IsMust: - return; - if crawItem.IsHTMLorText is False: - text = node[0].text; - else: - text = etree.tostring(node[0]); - doc[crawItem.Name] = text; - return doc; - - -def GetHtmlTree(html): - root = etree.HTML(html); - tree = etree.ElementTree(root); - return tree; diff --git a/text.py b/text.py new file mode 100644 index 0000000..75fbc9f --- /dev/null +++ b/text.py @@ -0,0 +1,14 @@ +# -*- encoding: utf-8 -*- +""" +@File : text.py +@Time : 23/8/2019 14:14 +@Author : liyang +""" +import re +t = [{'1':'asgewt'},{'2':'asoiwerogroifndjkl'}] +for i in range(len(t)): + if '1' in t[i]: + flag = i + print('5') +del t[flag] +print(t) \ No newline at end of file diff --git a/xmlFile/2.xml b/xmlFile/2.xml new file mode 100644 index 0000000..4dde095 --- /dev/null +++ b/xmlFile/2.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/xmlFile/demo.xml b/xmlFile/demo.xml new file mode 100644 index 0000000..bcc16a3 --- /dev/null +++ b/xmlFile/demo.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/xmlFile/demo\347\232\204\350\241\250.xlsx" "b/xmlFile/demo\347\232\204\350\241\250.xlsx" new file mode 100644 index 0000000..73501ef Binary files /dev/null and "b/xmlFile/demo\347\232\204\350\241\250.xlsx" differ diff --git a/xmlFile/project.xml b/xmlFile/project.xml new file mode 100644 index 0000000..954367d --- /dev/null +++ b/xmlFile/project.xml @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/xmlFile/t.xml b/xmlFile/t.xml new file mode 100644 index 0000000..fadc144 --- /dev/null +++ b/xmlFile/t.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file