Skip to content
This repository was archived by the owner on May 6, 2020. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ Além do código do crawler do jusbrasil, você encontrará em anexo um segundo

Nos próximos dias devemos definir se continuaremos desenvolvendo a versão principal do crawler ou o novo crawler dentro do site do Planalto.

Atualizado em 22/03/2020 por Bruno Omena e Arthur Omena
Atualizado em 22/03/2020 por Bruno Omena e Emilly Omena
Binary file modified Result.xlsx
Binary file not shown.
125 changes: 89 additions & 36 deletions robo.ipynb → crawler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,30 @@
"import re\n",
"import spacy\n",
"import time\n",
"import pandas as pd \n",
"import pandas as pd\n",
"import ipywidgets as widgets\n",
"from ipywidgets import HBox, VBox, Label\n",
"from ipywidgets import interact, interact_manual"
"from ipywidgets import interact, interact_manual\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#pip install ipywidgets\n",
"#pip install selenium\n",
"#pip install spacy\n",
"#pip install pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def onlynumbers(gimmestring):\n",
" num = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
Expand All @@ -34,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -158,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -170,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -185,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -224,7 +237,7 @@
" title = driver.find_element_by_xpath(\"//*[@id='app-root']/div/div/div[1]/div[2]/div[2]/div[{}]/div/h2\".format(x))\n",
" body = driver.find_element_by_xpath(\"//*[@id='app-root']/div/div/div[1]/div[2]/div[2]/div[{}]/div/div[2]\".format(x))\n",
" link = driver.find_element_by_xpath(\"//*[@id='app-root']/div/div/div[1]/div[2]/div[2]/div[{}]/div/h2/a\".format(x))\n",
" link_text = link.get_attribute(\"hre\n",
" link_text = link.get_attribute(\"href\")\n",
" ## Extração dos campos NomeDiario, Data e Página\n",
" diary, date, page = extract_data_from_title(title.text)\n",
" \n",
Expand All @@ -236,7 +249,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -267,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -293,7 +306,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -302,7 +315,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -311,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -320,7 +333,7 @@
},
{
"cell_type": "code",
"execution_count": 135,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -336,33 +349,62 @@
" \n",
" result_count = searchcount()\n",
" result_num = int(onlynumbers(result_count))\n",
" print(str(result_num)+\" resultados encontrados\")\n",
" print(\"foram encontrados \"+str(result_num)+\" resultados\")\n",
" \n",
" pages_txt = int(result_num/10)+1\n",
" print(str(pages_txt)+\" paginas para crawlear\")\n",
" else:\n",
" driver.get(URL+search)\n",
" infoL = page_list()\n",
" try:\n",
" df_all = df_all.append(infoL)\n",
" print(str(len(df_all))+\" resultados encontrados\")\n",
" except:\n",
" print(\"pagina \"+str(pages)+\" sem resultados\")\n",
" \n",
" df_all = df_all.append(infoL)\n",
" sys.stdout.write(\"\\r%s\" % len(df_all)+\" resultados encontrados\")\n",
" sys.stdout.flush()\n",
" \n",
" pages += 1\n",
" \n",
" return df_all"
" return df_all\n",
"\n",
def get_body(df_all):
    """Download the full article text for every row and store it in ``body``.

    For each row, the page at the row's ``url`` is opened with the
    notebook-level Selenium ``driver`` and the text of the article
    container is written to that row's ``body`` cell.

    Args:
        df_all: DataFrame with a ``url`` column. It is modified in place.

    Returns:
        The same ``df_all`` object, with ``body`` filled in.
    """
    # Nothing to fetch; this also covers a column-less empty frame, where
    # looking up "url" would raise KeyError.
    if df_all.empty:
        return df_all

    article_selector = "#app-root > div > div > div.WithMetricsDispatcher > div > div.container > div > div > article > div"

    # Iterate over (index label, url) pairs so the write below targets the
    # very row the URL came from. Reading by position and writing by label
    # only agree when the index is a default RangeIndex.
    for row_label, url in df_all["url"].items():
        driver.get(url)
        article = driver.find_element_by_css_selector(article_selector)
        df_all.at[row_label, 'body'] = article.text
        # Short pause between page loads.
        time.sleep(0.2)
    return df_all
"\n",
def filtrar_fonte(df_all):
    """Return the rows whose ``titulo`` does not mention a court or justice source.

    A row is dropped when its title contains any of the terms in
    ``filter_list`` as a case-sensitive substring.

    Args:
        df_all: DataFrame with a ``titulo`` column. It is not modified.

    Returns:
        A new DataFrame with the surviving rows, keeping their original
        index labels and column order. The result keeps the input columns
        even when every row is dropped.
    """
    filter_list = ['Tribunal', 'tribunal', 'Justica', 'justica', 'Justiça', 'justiça']

    # An empty input may have no "titulo" column at all; there is nothing
    # to filter, so hand back an (empty) copy.
    if df_all.empty:
        return df_all.copy()

    # One boolean per row: True when the title contains a filtered term.
    is_court_source = df_all["titulo"].map(
        lambda title: any(term in title for term in filter_list)
    )

    # A single boolean-mask selection replaces row-by-row DataFrame.append,
    # which was quadratic and no longer exists in pandas >= 2.0. Copy so
    # that later in-place writes do not touch a view of the input.
    return df_all[~is_court_source].copy()
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Troque \"result_path\" pelo local onde deseja salvar o arquivo .xlsx"
]
},
{
"cell_type": "code",
"execution_count": 138,
"execution_count": 183,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9732db1933c4bcea0f36f1b6ee1544a",
"model_id": "19111716d4a642968e44ec332987b8a2",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -375,6 +417,16 @@
}
],
"source": [
"## Interact é uma função do pacote 'ipywidgets', que forma o console que você verá abaixo\n",
"\n",
"# escolha_url é um separador criado com base nos tipos de fonte disponíveis no site do datajus\n",
"# Escolher uma fonte te mostrará EXCLUSIVAMENTE aquele tipo de fonte, a não ser que deixe selecionada a opção \"Tudo\"\n",
"\n",
"# date_from e date_to servem para encontrar documentos dentro daquele período\n",
"\n",
"# em KEYWORD, insira o termo a ser pesquisado\n",
"\n",
"# a opção JURIS (filtro por jurisprudência) não está mais disponível neste formulário\n",
"@interact_manual\n",
"def choose(escolha_url=url_list[\"descricao\"],\n",
" date_from = widgets.DatePicker(\n",
Expand All @@ -385,12 +437,7 @@
" disabled=False),\n",
" keyword = widgets.Text(\n",
" value=\"covid+fechamento\",\n",
" description=\"keywords\"),\n",
" juris = widgets.SelectMultiple(\n",
" options=jurisprudencia[\"descricao\"],\n",
" value=['none'],\n",
" disabled=False,\n",
" style={'description_width': 'initial'})):\n",
" description=\"keywords\")):\n",
" \n",
" for i in range(len(url_list)):\n",
" if(escolha_url==url_list.iloc[i][\"descricao\"]):\n",
Expand All @@ -402,16 +449,22 @@
" other = ''\n",
" \n",
" if(url=='none'):\n",
" df_all = find_diarios('https://www.jusbrasil.com.br/diarios/','covid+fechamento',other)\n",
" df_all = df_all.reset_index()\n",
" df_all.to_excel(r'Result.xlsx', )\n",
" return df_all\n",
" string = 'https://www.jusbrasil.com.br/diarios/'\n",
" else:\n",
" string = 'https://www.jusbrasil.com.br/'+url\n",
" df_all = find_diarios(string,'covid+fechamento',other)\n",
" df_all = df_all.reset_index()\n",
" df_all.to_excel(r'Result.xlsx', )\n",
" return df_all"
" \n",
" df_all = find_diarios(string,'covid+fechamento',other)\n",
" df_all = df_all.reset_index()\n",
" df_all = df_all.drop(labels=\"index\", axis=1)\n",
" df_all = df_all.rename(columns={0:\"titulo\",1:\"data\",2:\"pagina\",3:\"url\",4:\"body\"})\n",
" \n",
" df_all = filtrar_fonte(df_all)\n",
" df_all = df_all.reset_index()\n",
" df_all = df_all.drop(labels=\"index\", axis=1)\n",
" df_all = get_body(df_all)\n",
" \n",
" df_all.to_excel(\"Result.xlsx\",sheet_name='data')\n",
" return df_all"
]
},
{
Expand Down
Loading