Skip to content

Commit 7d6fa2e

Browse files
author
Veli Eroglu
committed
merge afetharita-kafka-consumer-address -> deprem-yardim-address-api
1 parent 8a0b61c commit 7d6fa2e

17 files changed

+517
-19
lines changed

.dockerignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
venv/
2+
data.csv
3+
lab.py
4+
.idea/

.env.example

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
CLIENT_ID="Consumer Test"
2+
3+
BOOTSTRAP_SERVERS=<test-kafka>
4+
KAFKA_ADDRESS_RESOLVE_TOPIC="<test-topic>"
5+
KAFKA_PROCESSED_TOPIC="<test-topic>"
6+
INTENT_TARGET_TOPIC="<test-topic>"
7+
KAFKA_INTENT_TOPIC="<test-topic>"
8+
HF_HUB_TOKEN=<hub_token>
9+
10+
GROUP_ID=
11+
12+
# Max message pool count
13+
MAX_POOL_RECORDS=10
14+
MESSAGE_TIMEOUT_MS=200
15+
SECURITY_PROTOCOL=
16+
SASL_MECHANISMS=
17+
SASL_PLAIN_USERNAME=
18+
SASL_PLAIN_PASSWORD=
19+
ENABLE_AUTO_COMMIT=True
20+
OPENAI_API_KEY=<API_KEY>
21+
GOOGLE_API_KEY=<GAPI_KEY>
22+
NER_API_KEY=<NER_API_KEY>
23+
SENTRY_DSN=<SENTRY_DSN>

Dockerfile

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Use Python 3.11 as the base image
FROM python:3.11

# Set the working directory
WORKDIR /projectKafka
# Make the project root importable (e.g. `from consumers import ...`).
# NOTE: key=value form; the legacy `ENV key value` syntax is deprecated.
ENV PYTHONPATH="."

# Install the required packages first, so the pip layer is cached and
# only invalidated when requirements.txt itself changes.
COPY requirements.txt /projectKafka/
RUN pip install -r requirements.txt

# Copy the publisher and consumer scripts
COPY ./ /projectKafka/
# Set the entrypoint
ENTRYPOINT ["python"]
# Set the default command to run the publisher and consumer
CMD ["/projectKafka/consumers/address_resolve.py"]

README.md

+41-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,41 @@
1-
# deprem-yardim-address-api
1+
# Mysterious Project
2+
3+
- A stream-processing project written in Python that uses Apache Kafka as its messaging framework.
4+
5+
## Prerequisites
6+
- Python 3.10+
7+
- Apache Kafka
8+
9+
## Installation
10+
- Clone the repository to your local machine.
11+
```
12+
git clone https://github.com/[username]/stream-project-kafka.git
13+
```
14+
15+
- Install the required packages mentioned in the requirements.txt file.
16+
```
17+
pip install -r requirements.txt
18+
```
19+
20+
## Running
21+
22+
- Start the address_resolve consumer by running the following command:
23+
24+
```
25+
python consumers/address_resolve.py
26+
```
27+
28+
- Start the processed consumer by running the following command:
29+
30+
```
31+
python consumers/processed.py
32+
```
33+
34+
## Contributing
35+
- Fork the repository.
36+
- Create a new branch for your changes.
37+
- Commit and push your changes to the new branch.
38+
- Create a pull request to the master branch.
39+
40+
## License
41+
This project is licensed under the MIT License.

address_resolver.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,20 @@ def __init__(self, GOOGLE_API_KEY, OPENAI_API_KEY, NER_API_KEY):
2323

2424
def google_geocode_api_request(self, address_text: str, entry_id: int):
    """Geocode *address_text* via the Google API and tag the result with its id."""
    payload = self.google_api.request(address_text)
    payload['id'] = entry_id
    return payload
2828

2929
def regex_api_request(self, address_text: str, entry_id: int):
    """Extract an address from *address_text* with the regex backend; tag with its id."""
    payload = self.regex_api.extract(address_text)
    payload['id'] = entry_id
    return payload
3333

3434
def ner_api_request(self, address_text: str, entry_id: int):
    """Run NER address extraction on *address_text*; tag the result with its id."""
    payload = self.ner_api.query(address_text)
    payload['id'] = entry_id
    return payload
3838

3939
def openai_api_request(self, address_text: str, entry_id: int):
    """Parse *address_text* with the OpenAI backend; tag the result with its id."""
    payload = self.open_api.single_request(address_text)
    payload['id'] = entry_id
    return payload

config.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Application settings, loaded from the process environment (.env supported)."""
from environs import Env

env = Env()
# Read .env into os.environ
env.read_env()


# --- Kafka settings ---
CLIENT_ID = env.str("CLIENT_ID")
BOOTSTRAP_SERVERS = env.list("BOOTSTRAP_SERVERS")
KAFKA_ADDRESS_RESOLVE_TOPIC = env.str("KAFKA_ADDRESS_RESOLVE_TOPIC")
KAFKA_PROCESSED_TOPIC = env.str("KAFKA_PROCESSED_TOPIC")
KAFKA_INTENT_TOPIC = env.str("KAFKA_INTENT_TOPIC")
INTENT_TARGET_TOPIC = env.str("INTENT_TARGET_TOPIC")
HF_HUB_TOKEN = env.str("HF_HUB_TOKEN")
# GROUP_ID = env.int("GROUP_ID", None)

# Max message pool count
MAX_POOL_RECORDS = env.int("MAX_POOL_RECORDS")
MESSAGE_TIMEOUT_MS = env.int("MESSAGE_TIMEOUT_MS")
SECURITY_PROTOCOL = env.str("SECURITY_PROTOCOL")
SASL_MECHANISMS = env.str("SASL_MECHANISMS")
SASL_PLAIN_USERNAME = env.str("SASL_PLAIN_USERNAME")
SASL_PLAIN_PASSWORD = env.str("SASL_PLAIN_PASSWORD")
ENABLE_AUTO_COMMIT = env.bool("ENABLE_AUTO_COMMIT")

# --- Third-party API keys ---
SENTRY_DSN = env.str("SENTRY_DSN")
GOOGLE_API_KEY = env.str("GOOGLE_API_KEY")
OPENAI_API_KEY = env.str("OPENAI_API_KEY")
NER_API_KEY = env.str("NER_API_KEY")

consumers/__init__.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import logging
2+
from sentry_sdk import capture_exception
3+
from models.conf import KafkaSettings, KafkaConsumerSettings
4+
import aiokafka
5+
6+
class BaseKafkaClient:
    """Base class pairing an aiokafka consumer with a producer.

    Subclasses override :meth:`process_message` to handle one record;
    :meth:`run` starts both clients and polls batches until ``running``
    is cleared. Exceptions raised while handling a record are logged and
    reported to Sentry, never allowed to kill the poll loop.
    """

    def __init__(self, server_settings: KafkaSettings, topic: str):
        self.settings = server_settings
        # Fix: previously only assigned inside run(), so reading it before
        # run() raised AttributeError. Start in the "not running" state.
        self.running = False
        consumer_settings = KafkaConsumerSettings(**server_settings.dict())
        # init consumer & producer
        self.consumer = aiokafka.AIOKafkaConsumer(
            topic, **consumer_settings.dict(exclude_none=True))
        self.producer = aiokafka.AIOKafkaProducer(
            bootstrap_servers=server_settings.bootstrap_servers,
            loop=server_settings.loop)

    async def _process_message(self, record: aiokafka.ConsumerRecord):
        """Invoke process_message, logging and reporting any failure to Sentry."""
        try:
            await self.process_message(record)
        except Exception as exc:
            logging.error(exc)
            capture_exception(exc)

    async def process_message(self, record: aiokafka.ConsumerRecord):
        """Hook for subclasses: handle a single Kafka record."""
        ...

    async def process(self):
        """Poll one batch of records and process each record in it."""
        data = await self.consumer.getmany(
            timeout_ms=self.settings.message_timeout_ms,
            max_records=self.settings.max_pool_records)
        for topic, records in data.items():
            for record in records:
                try:
                    await self._process_message(record)
                except Exception as exc:
                    # Defensive belt-and-braces: _process_message already
                    # swallows handler errors; this guards the loop itself.
                    capture_exception(exc)

    async def run(self):
        """Start consumer and producer, then poll until ``running`` is cleared.

        Both clients are always stopped on exit, even if polling raises.
        """
        self.running = True
        try:
            logging.info("Kafka Consumer Started...")
            await self.consumer.start()
            await self.producer.start()
            while self.running:
                await self.process()
        finally:
            logging.info("Kafka Consumer Stopped...")
            await self.consumer.stop()
            await self.producer.stop()

consumers/address_resolve.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# built-in python
2+
import asyncio
3+
from io import BytesIO
4+
from multiprocessing.pool import ThreadPool
5+
import logging
6+
7+
# third-party
8+
import sentry_sdk
9+
import orjson
10+
import pandas as pd
11+
import aiokafka
12+
13+
# in-house
14+
from consumers import BaseKafkaClient
15+
from models.conf import KafkaSettings
16+
from config import *
17+
from address_resolver import AddressAPI
18+
19+
# set logger level
20+
logger = logging.getLogger()
21+
logger.setLevel(logging.INFO)
22+
23+
sentry_sdk.init(dsn=SENTRY_DSN)
24+
25+
address_api = AddressAPI(GOOGLE_API_KEY, OPENAI_API_KEY, NER_API_KEY)
26+
27+
class AddressResolve(BaseKafkaClient):
    """Consumer that resolves raw address texts into geocoded locations.

    Per message (a JSON array of entries with ``id`` and ``raw_text``):
      1. Regex extraction; entries with confidence ``ws >= 0.7`` are accepted.
      2. NER extraction on the remainder; entries with ``ws >= 0.5`` join them.
      3. Accepted addresses are geocoded via the Google API in parallel.
      4. Resolved rows are merged back with the original feed columns and
         published to ``KAFKA_PROCESSED_TOPIC``.
    """

    async def process_message(self, record: aiokafka.ConsumerRecord):
        message = record.value

        address_df = pd.read_json(BytesIO(message))
        # Keep an untouched copy; feed columns are re-attached after geocoding.
        address_df_replica = address_df.copy()

        # --- Regex pass: cheap extraction; keep confident hits.
        regex_results = pd.DataFrame(
            [address_api.regex_api_request(raw_text, entry_id)
             for raw_text, entry_id
             in zip(address_df.raw_text.values, address_df.id.values)])
        regex_to_geocode = regex_results[regex_results.ws >= 0.7]
        del regex_results

        # --- NER pass on the entries regex could not confidently resolve.
        address_df = address_df[~address_df.id.isin(regex_to_geocode.id.values)]
        with ThreadPool(60) as executor:
            ner_results = executor.map(
                lambda p: address_api.ner_api_request(*p),
                zip(address_df.raw_text.values, address_df.id.values))
        ner_results = pd.DataFrame(ner_results)
        ner_to_geocode = ner_results[ner_results.ws >= 0.5]
        del ner_results

        geocode_data = pd.concat([regex_to_geocode[['address', 'id']],
                                  ner_to_geocode[['address', 'id']]], axis=0)
        del regex_to_geocode, ner_to_geocode

        # --- Geocode the extracted addresses in parallel.
        with ThreadPool(60) as executor:
            geocode_data = executor.map(
                lambda p: address_api.google_geocode_api_request(*p),
                zip(geocode_data.address.values, geocode_data.id.values))

        geocode_data = pd.DataFrame(geocode_data)
        # Keep only successfully resolved rows and re-attach the feed columns.
        geocode_data = pd.merge(geocode_data[geocode_data.is_resolved],
                                address_df_replica, on='id', how='left')
        del address_df_replica

        final_data = [self._build_entry(row)
                      for _, row in geocode_data.iterrows()]

        await self.producer.send_and_wait(KAFKA_PROCESSED_TOPIC,
                                          orjson.dumps(final_data))
        logger.info("Message Processed.")

    @staticmethod
    def _build_entry(d) -> dict:
        """Shape one merged geocode row into the published payload dict."""
        return {
            'location': {
                "formatted_address": d.get('formatted_address', ''),
                "latitude": d.get('latitude', 0.0),
                "longitude": d.get('longitude', 0.0),
                "northeast_lat": d.get('northeast_lat', 0.0),
                "northeast_lng": d.get('northeast_lng', 0.0),
                "southwest_lat": d.get('southwest_lat', 0.0),
                "southwest_lng": d.get('southwest_lng', 0.0),
                "entry_id": d.get('id'),
                "epoch": d.get('epoch'),
                "channel": d.get('channel')},
            'feed': {
                "id": d.get('id'),
                "raw_text": d.get('raw_text'),
                "channel": d.get('channel'),
                "extra_parameters": d.get('extra_parameters', {}),
                "epoch": d.get('epoch')}
        }
98+
if __name__ == '__main__':
    # Run the consumer on a dedicated event loop.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    settings = KafkaSettings(
        loop=event_loop,
        client_id=CLIENT_ID,
        bootstrap_servers=BOOTSTRAP_SERVERS,
        max_pool_records=MAX_POOL_RECORDS,
        message_timeout_ms=MESSAGE_TIMEOUT_MS,
    )

    try:
        consumer = AddressResolve(topic=KAFKA_ADDRESS_RESOLVE_TOPIC,
                                  server_settings=settings)
        event_loop.run_until_complete(consumer.run())
    finally:
        # Drain async generators before closing the loop to avoid warnings.
        event_loop.run_until_complete(event_loop.shutdown_asyncgens())
        event_loop.close()

consumers/intent.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# built-in python
2+
import asyncio
3+
4+
# third-party
5+
import logging
6+
import aiokafka
7+
import sentry_sdk
8+
9+
# in-house
10+
from consumers import BaseKafkaClient
11+
from models.conf import KafkaSettings
12+
from config import *
13+
from orjson import loads
14+
from helpers.intent import batch_query
15+
16+
# set logger level
17+
logger = logging.getLogger()
18+
logger.setLevel(logging.INFO)
19+
20+
sentry_sdk.init(dsn=SENTRY_DSN)
21+
22+
class Intent(BaseKafkaClient):
    """Consumer that runs intent classification over incoming feed texts.

    Each record holds a JSON array of feed items; the ``full_text`` of every
    non-empty item is sent to the Hugging Face endpoint as one batch, and
    the raw response is forwarded to ``INTENT_TARGET_TOPIC``.
    """

    async def process_message(self, record: aiokafka.ConsumerRecord):
        message = record.value
        message_dict = loads(message)
        full_text_list = [i.get("full_text") for i in message_dict if i]

        if not full_text_list:
            # Consistency fix: use the module logger (configured above),
            # not the root logging module; pass args lazily.
            logger.warning("Raw text is empty, Message: %s", message_dict)
            return

        response = batch_query(full_text_list, None)

        if not response:
            logger.warning(
                "No response from hugging face endpoint message: %s",
                message_dict)
            return

        await self.producer.send_and_wait(INTENT_TARGET_TOPIC,
                                          response)
if __name__ == '__main__':
43+
loop = asyncio.new_event_loop()
44+
asyncio.set_event_loop(loop)
45+
46+
kafka_settings = KafkaSettings(
47+
loop=loop,
48+
client_id=CLIENT_ID,
49+
bootstrap_servers=BOOTSTRAP_SERVERS,
50+
max_pool_records=MAX_POOL_RECORDS,
51+
message_timeout_ms=MESSAGE_TIMEOUT_MS
52+
)
53+
54+
try:
55+
server = Intent(topic=KAFKA_INTENT_TOPIC, server_settings=kafka_settings)
56+
loop.run_until_complete(server.run())
57+
finally:
58+
loop.run_until_complete(loop.shutdown_asyncgens())
59+
loop.close()

helpers/google_geocode_api.py

-6
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
import json
21
import requests
3-
import pandas as pd
4-
from googlemaps import Client as GoogleMaps
5-
import googlemaps
6-
import gmaps
7-
import urllib.parse
82

93
class GoogleGeocodeAPI:
104
def __init__(self, api_key):

0 commit comments

Comments
 (0)