# ========================= convert text data into json data =============
# import json
# # Initialize an empty list to store product dictionaries
# products = []
# # Read data from the file
# with open('data.txt', 'r', encoding='utf-8') as file:
#     data = file.read()
# # Split the data into individual products using a double newline as the separator
# product_data = data.strip().split('\n\n')
# # Process each product's lines to build a product dictionary
# for product_info in product_data:
#     parts = product_info.split('\n')
#     product = {
#         "product_name": parts[0],
#         "product_image": parts[1],
#         "brand": parts[2],
#         "types": parts[3],
#         "price": parts[4],
#         "short_description": parts[5],
#         "rating": parts[6]
#     }
#     products.append(product)
# # Convert the list of product dictionaries to JSON
# json_data = json.dumps(products, indent=4, ensure_ascii=False)
# # Save the JSON data to a file
# with open('symphony.json', 'w', encoding='utf-8') as json_file:
#     json_file.write(json_data)
# print("JSON data has been successfully created and saved to 'symphony.json'.")
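# For reference, a minimal sketch of the data.txt layout this script assumes:
# one field per line, in the same order as the dictionary keys above, with a
# blank line between products. The values below are hypothetical placeholders:
#
# Symphony Z60
# https://example.com/images/symphony-z60.jpg
# Symphony
# Smartphone
# 12,999 BDT
# 6.52-inch display with a 5000 mAh battery
# 4.5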
# ====================== web crawling ================
# import requests
# from bs4 import BeautifulSoup
# import json
# # Initialize an empty list to store the extracted data
# all_data = []
# # Read the list of URLs from the data.txt file
# with open('data.txt', 'r') as file:
#     urls = file.read().splitlines()
# # Iterate through each URL
# for url in urls:
#     url = url.strip()  # Remove leading/trailing spaces
#     if not url:
#         continue  # Skip empty lines
#     try:
#         # Send an HTTP GET request to the URL
#         response = requests.get(url)
#         response.raise_for_status()  # Check for HTTP request errors
#         # Parse the HTML content of the page using BeautifulSoup
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Find all tables on the page
#         tables = soup.find_all('table')
#         # Check if there are at least two tables on the page
#         if len(tables) >= 2:
#             # Get the second table (index 1) and extract its data
#             second_table = tables[1]
#             table_data = {}
#             for row in second_table.find_all('tr'):
#                 columns = row.find_all('td')
#                 if len(columns) == 2:
#                     key = columns[0].get_text().strip()
#                     value = columns[1].get_text().strip()
#                     table_data[key] = value
#             # Add the extracted data to the list
#             all_data.append(table_data)
#         else:
#             print(f"There are not enough tables on the page to extract data for {url}")
#     except requests.exceptions.RequestException as e:
#         print(f"Failed to retrieve the page for {url}: {str(e)}")
# # Convert the list of data to JSON format
# json_data = json.dumps(all_data, indent=4, ensure_ascii=False)
# # Save the JSON data to a file
# with open('details.json', 'w', encoding='utf-8') as json_file:
#     json_file.write(json_data)
# print("JSON data has been successfully created and saved to 'details.json'.")
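# For reference, each entry written to details.json is a key/value dict built
# from the rows of the second <table> on a page, so the keys depend on that
# site's spec labels. A hypothetical sketch of one entry:
#
# {
#     "Display": "6.55-inch AMOLED",
#     "Battery": "4500 mAh",
#     "Chipset": "Snapdragon 7 Gen 1"
# }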
# ================= web crawling 2 =========================
# import requests
# from bs4 import BeautifulSoup
# import json
# # Function to scrape data from a URL
# def scrape_data(url):
#     # Send an HTTP GET request to the URL
#     response = requests.get(url)
#     # Create a dictionary to store the extracted data
#     result = {}
#     base_url = "https://www.mobiledokan.com"
#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the HTML content of the page
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Find all the figure tags with class 'size-full' or 'size-large'
#         figure_tags = soup.find_all('figure', class_=['size-full', 'size-large'])
#         # Extract and store the image sources
#         image_sources = []
#         for figure in figure_tags:
#             img_tag = figure.find('img')
#             if img_tag:
#                 img_src = img_tag['data-src']
#                 image_sources.append(base_url + img_src)
#         # Find the "Highlights" header
#         highlights_header = soup.find('h3', text='Highlights')
#         if highlights_header:
#             # Find the next three <p> tags after the "Highlights" header
#             p_tags = highlights_header.find_all_next('p', limit=3)
#             # Extract the content of the <p> tags
#             paragraph_texts = [p_tag.get_text() for p_tag in p_tags]
#             # Store the image sources and details in the result dictionary
#             result["image1"] = image_sources[0] if len(image_sources) > 0 else ""
#             result["image2"] = image_sources[1] if len(image_sources) > 1 else ""
#             result["image3"] = image_sources[2] if len(image_sources) > 2 else ""
#             result["details"] = " ".join(paragraph_texts)
#             return result
#     else:
#         print(f"Failed to retrieve the webpage at {url}. Status code: {response.status_code}")
#         return None
# # Read the list of URLs from the 'data.txt' file
# with open('data.txt', 'r') as file:
#     urls = file.read().splitlines()
# # Create a list to store the scraped data
# scraped_data = []
# # Iterate through the URLs and scrape data
# for url in urls:
#     print(f"Scraping data from {url}")
#     data = scrape_data(url)
#     if data:
#         scraped_data.append(data)
# # Save the scraped data to a single JSON file
# with open('phone2.json', 'w') as json_file:
#     json.dump(scraped_data, json_file, indent=4)
# print("Scraped data saved to 'phone2.json'")
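# For reference, one sample entry produced by this scraper (from the
# commented-out run against mobiledokan.com) looks like the following: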
# {
#     "image1": "https://www.mobiledokan.com/wp-content/uploads/2023/09/Xiaomi-13-Lite.jpg",
#     "image2": "https://www.mobiledokan.com/wp-content/uploads/2023/09/Xiaomi-13-Lite-blue.webp",
#     "image3": "https://www.mobiledokan.com/wp-content/uploads/2023/09/Xiaomi-13-Lite-colors.webp",
#     "details": "Xiaomi 13 Lite is the Lite version of Xiaomi 13. Lite usually means with lower price and configuration. The main highlight of this gadget is its Dual 32+8 MP front camera which is rare these days. The main 32 MP lens captures 100˚ photos which makes it an ultrawide lens. Another great thing is that it charges fully in a maximum of 40 minutes although the battery is 4500 mAh and not a typical 5000 mAh one. The AMOLED display is pleasing for the eyes with 68B colors, Dolby Vision, and HDR10+ features.The 4 nm Snapdragon 7 Gen 1 chipset from Qualcomm is a good performance and gaming chipset at this price point. There is also an optical in-display fingerprint sensor. The 13 Lite from Xiaomi is a 5 G-supported smartphone."
# }
# ==================================== merge 2 files ===========================
import json
# Load data from "phone.json" and "phone2.json"
with open("phone.json", "r") as file1, open("phone2.json", "r") as file2:
    data1 = json.load(file1)
    data2 = json.load(file2)
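# Note: zip() below silently truncates to the shorter list, so the merge assumes
# phone.json and phone2.json hold the same number of entries in the same order.
# An optional guard (not part of the original script) could make that explicit:
# if len(data1) != len(data2):
#     raise ValueError(f"Entry count mismatch: {len(data1)} vs {len(data2)}")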
# Merge the data into a single list
merged_data = []
for entry1, entry2 in zip(data1, data2):
    # Combine data from both files into a single dictionary
    merged_entry = {
        "product_name": entry1["product_name"],
        "product_image": entry1["product_image"],
        "brand": entry1["brand"],
        "types": entry1["types"],
        "price": entry1["price"],
        "short_description": entry1["short_description"],
        "rating": entry1["rating"],
        "image1": entry2["image1"],
        "image2": entry2["image2"],
        "image3": entry2["image3"],
        "details": entry2["details"]
    }
    merged_data.append(merged_entry)
# Save the merged data to "phone3.json"
with open("phone3.json", "w") as output_file:
    json.dump(merged_data, output_file, indent=4)
print("Merged data saved to 'phone3.json'")
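# Usage (assumed): with phone.json and phone2.json already generated by the
# commented-out scripts above, running `python data.py` writes the merged
# catalog to phone3.json.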