diff --git a/.gitignore b/.gitignore index ead5279..5b5160d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ venv/ hidden* other_data.json export_result_TG_account_N4.json - +*.json \ No newline at end of file diff --git a/add_user_ids_to_postgres_v2.py b/add_user_ids_to_postgres_v2.py index a36db48..4928e14 100644 --- a/add_user_ids_to_postgres_v2.py +++ b/add_user_ids_to_postgres_v2.py @@ -4,7 +4,7 @@ host=hidden_2.secrets_2()["host"], port=hidden_2.secrets_2()["port"], database=hidden_2.secrets_2()["database"], - user=hidden_2.secrets_2()["user"] + user=hidden_2.secrets_2()["user"], ) cur_1 = connection_1.cursor() @@ -24,13 +24,9 @@ # (w/ Python) Select all data from an existing table & print it # cur_1.execute("SELECT * FROM all_ip_addresses;") # var_2 = cur_1.fetchall() -#print(var_2) -#print(var_1) +# print(var_2) +# print(var_1) # (w/ Python) Connect to a DB in Postgres # (w/ Python) Create a new table in Postgres w/ necessary columns - - - - diff --git a/config.py b/config.py new file mode 100644 index 0000000..41fcd8a --- /dev/null +++ b/config.py @@ -0,0 +1,4 @@ +import os + +# DATABASE_STRING = os.getenv("DATABASE_STRING", "sqlite:///my_db.sqlite") +DATABASE_STRING = os.getenv("DATABASE_STRING", "postgresql://postgres:postgres@localhost/postgres") diff --git a/get_all_data_from_specific_chat.py b/get_all_data_from_specific_chat.py index b668707..87f039f 100644 --- a/get_all_data_from_specific_chat.py +++ b/get_all_data_from_specific_chat.py @@ -1,32 +1,37 @@ -import json +import json -with open('export_result_TG_account_N4.json', 'r') as json_file: - exported_data_nested_dict = json.load(json_file) +with open("export_result_TG_account_N4.json", "r") as json_file: + exported_data_nested_dict = json.load(json_file) -chat_id_input = input('Paste Telegram chat-ID (numbers only) to export all previous data from this chat: ') +chat_id_input = input( + "Paste Telegram chat-ID (numbers only) to export all previous data from this chat: " +) for k in 
exported_data_nested_dict["chats"]["list"]: - if chat_id_input == str(k["id"]): - chat_data_json_string = json.dumps(k) - print('\nFound {} chat-ID among your chats. \nHere is all raw data from this chat: '.format(chat_id_input)) - print(chat_data_json_string) - - # print(type(chat_data_json_string)) - # print(k) - # print(type(k)) - - - # if chat_id_input != str(k["id"]): - # break - # else: - # print('Found this chat-ID') - - # try: - # chat_id_input == str(k["id"]) - # except: - # print('NOT found this chat-ID') - # else: - # print('Found this chat-ID') + if chat_id_input == str(k["id"]): + chat_data_json_string = json.dumps(k) + print( + "\nFound {} chat-ID among your chats. \nHere is all raw data from this chat: ".format( + chat_id_input + ) + ) + print(chat_data_json_string) + + # print(type(chat_data_json_string)) + # print(k) + # print(type(k)) + + # if chat_id_input != str(k["id"]): + # break + # else: + # print('Found this chat-ID') + + # try: + # chat_id_input == str(k["id"]) + # except: + # print('NOT found this chat-ID') + # else: + # print('Found this chat-ID') # all_chat_ids_list = [] @@ -34,15 +39,15 @@ # all_chat_ids_list.append(str(k["id"])) -# if chat_id_input in all_chat_ids_list: +# if chat_id_input in all_chat_ids_list: # index_of_chat = ?? 
# print('Found this chat-ID') -# else: -# print('NOT found this chat-ID') +# else: +# print('NOT found this chat-ID') - # chat_id = k["id"] - # print(chat_id) +# chat_id = k["id"] +# print(chat_id) # print(all_chat_ids_list) # number_of_chats = len(exported_data_nested_dict) @@ -51,10 +56,10 @@ # print(type(exported_data_nested_dict["chats"]["list"])) - # exported_data_string = json_file.read() +# exported_data_string = json_file.read() # print(exported_data_nested_dict) -# print(type(exported_data_nested_dict)) +# print(type(exported_data_nested_dict)) # print(chat_id_input) -# print(type(chat_id_input)) +# print(type(chat_id_input)) diff --git a/get_list_all_IPs.py b/get_list_all_IPs.py index 592d111..0395e31 100644 --- a/get_list_all_IPs.py +++ b/get_list_all_IPs.py @@ -1,37 +1,38 @@ -import json, psycopg2 -import hidden_2 # hidden_2.py file in repo with secrets about connection to the database +import json, psycopg2 +import hidden_2 # hidden_2.py file in repo with secrets about connection to the database conn_2 = psycopg2.connect( - host=hidden_2.secrets_2()['host'], - port=hidden_2.secrets_2()['port'], - database=hidden_2.secrets_2()['database'], - user=hidden_2.secrets_2()['user']) + host=hidden_2.secrets_2()["host"], + port=hidden_2.secrets_2()["port"], + database=hidden_2.secrets_2()["database"], + user=hidden_2.secrets_2()["user"], +) cur_2 = conn_2.cursor() # print(cur_2) # print(type(cur_2)) -var_1 = 'CREATE TABLE if not exists all_IP_addresses (id SERIAL, IP_address TEXT);' +var_1 = "CREATE TABLE if not exists all_IP_addresses (id SERIAL, IP_address TEXT);" cur_2.execute(var_1) conn_2.commit() # Flush to database server -with open('export_result_TG_account_N4.json', 'r') as json_file: - nested_dict = json.load(json_file) -list_1 = nested_dict['other_data']['ips'] +with open("export_result_TG_account_N4.json", "r") as json_file: + nested_dict = json.load(json_file) +list_1 = nested_dict["other_data"]["ips"] # print(list_1) -for dict_1 in list_1: - var_2 
= 'INSERT INTO all_ip_addresses (ip_address) VALUES (%s);' - cur_2.execute(var_2, (dict_1['ip'], )) -conn_2.commit() +for dict_1 in list_1: + var_2 = "INSERT INTO all_ip_addresses (ip_address) VALUES (%s);" + cur_2.execute(var_2, (dict_1["ip"],)) +conn_2.commit() -cur_2.execute('SELECT * FROM all_ip_addresses;') +cur_2.execute("SELECT * FROM all_ip_addresses;") var_3 = cur_2.fetchall() # print(type(var_3)) -print('Updated database: ', var_3) # To show the results via Terminal +print("Updated database: ", var_3) # To show the results via Terminal - # print(dict_1['ip']) # Get all IP addresses +# print(dict_1['ip']) # Get all IP addresses # var_2 = 'INSERT INTO all_ip_addresses (ip_address) VALUES ;' # cur_2.execute(var_2) # conn_2.commit() @@ -45,7 +46,7 @@ # cur_1 = conn_1.cursor() # cur_1.execute('SELECT * FROM account;') -# a_1 = cur_1.fetchall() +# a_1 = cur_1.fetchall() # print(a_1) # print(type(a_1)) diff --git a/models.py b/models.py new file mode 100644 index 0000000..93a979a --- /dev/null +++ b/models.py @@ -0,0 +1,50 @@ +from sqlalchemy import Column +from sqlalchemy import ForeignKey +from sqlalchemy import Integer, BigInteger +from sqlalchemy import String +from sqlalchemy import DateTime +from sqlalchemy.orm import declarative_base +from sqlalchemy.orm import relationship +from sqlalchemy import create_engine +import config + +Base = declarative_base() + +engine = create_engine(config.DATABASE_STRING, echo=True, future=True) + + +class TelegramChat(Base): + __tablename__ = "telegram_chat" + + id = Column(BigInteger, primary_key=True) + telegram_id = Column(BigInteger, unique=True) + name = Column(String(100)) + type = Column(String(50)) + + messages = relationship( + "TelegramMessage", back_populates="chat", cascade="all, delete" + ) + + def __repr__(self): + return f"Chat(id={self.id!r}, name={self.name!r})" + + +class TelegramMessage(Base): + __tablename__ = "telegram_message" + + id = Column(BigInteger, primary_key=True) + telegram_id = 
Column(BigInteger, unique=True) + date = Column(DateTime) + unix_timestamp = Column(BigInteger) + from_name = Column(String(100)) + from_id = Column(String(100)) + + chat_id = Column(BigInteger, ForeignKey("telegram_chat.telegram_id"), nullable=False) + + chat = relationship("TelegramChat", back_populates="messages") + + def __repr__(self): + return f"Message(id={self.id!r}, from_name={self.from_name!r})" + + +Base.metadata.create_all(engine) diff --git a/my_db.sqlite b/my_db.sqlite new file mode 100644 index 0000000..a76e94c Binary files /dev/null and b/my_db.sqlite differ diff --git a/parse_fields.py b/parse_fields.py new file mode 100644 index 0000000..c750b7d --- /dev/null +++ b/parse_fields.py @@ -0,0 +1,21 @@ +import json + +with open("result.json") as f: + data = json.load(f) + + +chat_fields = set() +message_fields = set() +text_entities_max_count = 0 + +for chat in data["chats"]["list"]: + chat_fields |= set(chat.keys()) + for message in chat["messages"]: + message_fields |= set(message.keys()) + if len(message['text_entities']) > text_entities_max_count: + text_entities_max_count = len(message['text_entities']) + print(message['id']) + +print(chat_fields) +print(message_fields) +print(f'{text_entities_max_count=}') \ No newline at end of file diff --git a/parse_telegram_data.py b/parse_telegram_data.py new file mode 100644 index 0000000..0042e20 --- /dev/null +++ b/parse_telegram_data.py @@ -0,0 +1,47 @@ +import json +from sqlalchemy import create_engine +from sqlalchemy.sql import text + +import config + +engine = create_engine(config.DATABASE_STRING, future=True) + +with open("result.json") as f: + data = json.load(f) + +with engine.connect() as con: + + for chat in data["chats"]["list"]: + statement = text( + """ + INSERT INTO telegram_chat(telegram_id, name, type) + VALUES(:id, :name, :type) + ON CONFLICT (telegram_id) DO NOTHING + """ + ) + con.execute( + statement, + {"id": chat["id"], "name": chat.get("name"), "type": chat["type"]}, + ) + + for 
message in chat["messages"]: + statement = text( + """ + INSERT INTO telegram_message(telegram_id, date, unix_timestamp, from_name, from_id, chat_id) + VALUES(:id, :date, :unix_timestamp, :from_name, :from_id, :chat_id) + ON CONFLICT (telegram_id) DO NOTHING + """ + ) + con.execute( + statement, + { + "id": message["id"], + "date": message["date"], + "unix_timestamp": message["date_unixtime"], + "from_name": message.get("from"), + "from_id": message.get("from_id"), + "chat_id": chat["id"], + }, + ) + + con.commit() diff --git a/top_frequent_contacts_list.py b/top_frequent_contacts_list.py index 223b249..3b0f5a7 100644 --- a/top_frequent_contacts_list.py +++ b/top_frequent_contacts_list.py @@ -1,24 +1,25 @@ -import json, psycopg2 -import hidden_2 # hidden_2.py file in repo with secrets about connection to the database +import json, psycopg2 +import hidden_2 # hidden_2.py file in repo with secrets about connection to the database conn_3 = psycopg2.connect( - host=hidden_2.secrets_2()['host'], - port=hidden_2.secrets_2()['port'], - database=hidden_2.secrets_2()['database'], - user=hidden_2.secrets_2()['user']) + host=hidden_2.secrets_2()["host"], + port=hidden_2.secrets_2()["port"], + database=hidden_2.secrets_2()["database"], + user=hidden_2.secrets_2()["user"], +) -cur_3 = conn_3.cursor() +cur_3 = conn_3.cursor() -with open('export_result_TG_account_N4.json', 'r') as json_file: - nested_dictionary_export = json.load(json_file) +with open("export_result_TG_account_N4.json", "r") as json_file: + nested_dictionary_export = json.load(json_file) # print(nested_dictionary_export) # print(type(nested_dictionary_export)) -list_3 = nested_dictionary_export['frequent_contacts']['list'] +list_3 = nested_dictionary_export["frequent_contacts"]["list"] # print(list_3) # print(type(list_3)) -var_1 = '''CREATE TABLE if not exists top_frequent_contacts ( +var_1 = """CREATE TABLE if not exists top_frequent_contacts ( id SERIAL, tg_id BIGINT UNIQUE, tg_category VARCHAR(128), @@ -26,21 
+27,29 @@ tg_name VARCHAR(128), tg_rating REAL, PRIMARY KEY (id) - );''' + );""" cur_3.execute(var_1) -conn_3.commit() # Flush to database server +conn_3.commit() # Flush to database server -for dict_3 in list_3: - var_2 = 'INSERT INTO top_frequent_contacts (tg_id, tg_category, tg_type, tg_name, tg_rating) VALUES (%s, %s, %s, %s, %s);' - cur_3.execute(var_2, (dict_3['id'], dict_3['category'], dict_3['type'], dict_3['name'], dict_3['rating'])) +for dict_3 in list_3: + var_2 = "INSERT INTO top_frequent_contacts (tg_id, tg_category, tg_type, tg_name, tg_rating) VALUES (%s, %s, %s, %s, %s);" + cur_3.execute( + var_2, + ( + dict_3["id"], + dict_3["category"], + dict_3["type"], + dict_3["name"], + dict_3["rating"], + ), + ) conn_3.commit() -cur_3.execute('SELECT * FROM top_frequent_contacts;') +cur_3.execute("SELECT * FROM top_frequent_contacts;") var_4 = cur_3.fetchall() # print(var_4) # To show the results via Terminal - # for dict_3 in list_3: # (dict_3['id']) # (dict_3['category']) @@ -53,5 +62,5 @@ # print(list_3[0]['rating']) # print(type(list_3[0]['rating'])) - # print(dict_3) - # print(type(dict_3)) +# print(dict_3) +# print(type(dict_3)) diff --git a/v2_experiments_exported_data.py b/v2_experiments_exported_data.py index 0f1cbd1..fafd507 100644 --- a/v2_experiments_exported_data.py +++ b/v2_experiments_exported_data.py @@ -1,14 +1,14 @@ -import json +import json -with open('export_result_TG_account_N4.json', 'r') as json_file: - dict_3 = json.load(json_file) # Load JSON file directly to a Python dictionary +with open("export_result_TG_account_N4.json", "r") as json_file: + dict_3 = json.load(json_file) # Load JSON file directly to a Python dictionary # print(type(dict_3)) -about_section_value = dict_3['about'] +about_section_value = dict_3["about"] # print(about_section_value) # print(type(about_section_value)) -first_name_value = dict_3['contacts']['list'][1]['first_name'] +first_name_value = dict_3["contacts"]["list"][1]["first_name"] # print(first_name_value) 
# print(type(first_name_value)) diff --git a/v3_add_user_ids_to_postgres.py b/v3_add_user_ids_to_postgres.py index 8434399..7d53133 100644 --- a/v3_add_user_ids_to_postgres.py +++ b/v3_add_user_ids_to_postgres.py @@ -13,7 +13,7 @@ host=os.getenv("HOST"), port=os.getenv("PORT"), database=os.getenv("DATABASE"), - user=os.getenv("USER") + user=os.getenv("USER"), ) cur_3 = connection_3.cursor() @@ -25,9 +25,9 @@ # (TASK) (w/ Python) Insert all the selected CHAT IDs to this table -for chat in correspondence_export_dictionary['chats']['list']: +for chat in correspondence_export_dictionary["chats"]["list"]: var_5 = "INSERT INTO chats (name, type, tgchatid) VALUES (%s, %s, %s);" - cur_3.execute(var_5, (chat.get('name', 'NA'), chat.get('type'), chat.get('id'))) + cur_3.execute(var_5, (chat.get("name", "NA"), chat.get("type"), chat.get("id"))) # var_4 = "INSERT INTO chats (name) VALUES (%s);" # cur_3.execute(var_4, (chat.get('name', 'NA'),)) @@ -38,31 +38,27 @@ # var_4 = "INSERT INTO chats (type) VALUES (%s);" # cur_3.execute(var_4, (chat.get('type', 'NA'),)) # This line worked! 
Pay attention to the comma between parenthesis at the end - #var_4 = "INSERT INTO chats (name, type, tgchatid) VALUES (%s, %s, %s);" - #cur_3.execute(var_4, (chat.get('name'), chat.get('type'), chat.get('id'))) + # var_4 = "INSERT INTO chats (name, type, tgchatid) VALUES (%s, %s, %s);" + # cur_3.execute(var_4, (chat.get('name'), chat.get('type'), chat.get('id'))) - #print(chat.get('messages')) - #print(chat.get('id', 'NA'), chat.get('type', 'NA'), chat.get('name', 'NA')) - #print(chat.get('name')) - #print(f"name: {chat.get('name', 'NA')}") - #print(chat['name']) - #print(type(chat)) + # print(chat.get('messages')) + # print(chat.get('id', 'NA'), chat.get('type', 'NA'), chat.get('name', 'NA')) + # print(chat.get('name')) + # print(f"name: {chat.get('name', 'NA')}") + # print(chat['name']) + # print(type(chat)) connection_3.commit() - - - - # Variant 1: # for k in dict_3["chats"]["list"]: # (CDL) Adding "if True" condition below is important, as it's better to check if every key exists in every dict # if k['']: # (?) How to bypass the error here? 
***"Saved messages" chat's dictionary has NO key "name" # print('yes') # else: # pass - #print(type(k)) - # var_3 = "INSERT INTO v1_tg_data_export(name, type, tgchatid) VALUES(%s, %s, %s)" - # cur_3.execute() +# print(type(k)) +# var_3 = "INSERT INTO v1_tg_data_export(name, type, tgchatid) VALUES(%s, %s, %s)" +# cur_3.execute() # if k['type'] in dict_3["chats"]["list"]: # Variant 2: @@ -70,9 +66,8 @@ # for k in range(): # Try a temporary solution: exclude "Saved messages" chat - -#a_1 = dict_3["chats"]["list"][1]["id"] -#print(a_1) +# a_1 = dict_3["chats"]["list"][1]["id"] +# print(a_1) # var_1 = "INSERT INTO contacts_users (tg_ids) VALUES (78901);" # cur_3.execute(var_1) diff --git a/v4_get_messages_from_chats.py b/v4_get_messages_from_chats.py index 50e4968..c65c663 100644 --- a/v4_get_messages_from_chats.py +++ b/v4_get_messages_from_chats.py @@ -7,7 +7,7 @@ host=os.getenv("HOST"), port=os.getenv("PORT"), database=os.getenv("DATABASE"), - user=os.getenv("USER") + user=os.getenv("USER"), ) cursor_4 = connection_4.cursor() @@ -47,25 +47,35 @@ for chat in export_data_to_dictionary["chats"]["list"]: for message_parameter in chat["messages"]: var_3 = "INSERT INTO messages_1 (tgmessageid, type, date, date_unixtime, fromname, fromid) VALUES (%s, %s, %s, %s, %s, %s)" - cursor_4.execute(var_3, (message_parameter["id"], message_parameter["type"], message_parameter["date"], message_parameter["date_unixtime"], message_parameter["from"], message_parameter["from_id"])) + cursor_4.execute( + var_3, + ( + message_parameter["id"], + message_parameter["type"], + message_parameter["date"], + message_parameter["date_unixtime"], + message_parameter.get("from"), + message_parameter.get("from_id"), + ), + ) # var_1 = message_parameter["id"] # var_2 = "INSERT INTO messages_1 (tgmessageid) VALUES (var_1);" # (?) Why does this line create an error? 
***psycopg2.errors.UndefinedColumn: column "var_1" does not exist # cursor_4.execute(var_2) connection_4.commit() - # print(message_text) - # print(type(message_text)) +# print(message_text) +# print(type(message_text)) - # print(message_date) - # print(type(message_date)) +# print(message_date) +# print(type(message_date)) - # print(parameters) - # print(type(parameters)) +# print(parameters) +# print(type(parameters)) - # print(message_data) - # print(type(message_data)) - #print(type(chat)) +# print(message_data) +# print(type(message_data)) +# print(type(chat)) # print(list_of_chats) # print(type(list_of_chats))