How to get latest offset for a partition for a kafka topic?
Question:
I am using the Python high level consumer for Kafka and want to know the latest offsets for each partition of a topic. However I cannot get it to work.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

# `brokers` and `topic` are expected to be defined by the surrounding code.
con = KafkaConsumer(bootstrap_servers=brokers)
ps = [TopicPartition(topic, p) for p in con.partitions_for_topic(topic)]
con.assign(ps)

# Only this print belongs to the loop: the output shows one highwater line per
# partition, followed by a single "Subscription" line.
for p in ps:
    print("For partition %s highwater is %s" % (p.partition, con.highwater(p)))

print("Subscription = %s" % con.subscription())
print("con.seek_to_beginning() = %s" % con.seek_to_beginning())
But the output I get is
For partition 0 highwater is None
For partition 1 highwater is None
For partition 2 highwater is None
For partition 3 highwater is None
For partition 4 highwater is None
For partition 5 highwater is None
....
For partition 96 highwater is None
For partition 97 highwater is None
For partition 98 highwater is None
For partition 99 highwater is None
Subscription = None
con.seek_to_beginning() = None
con.seek_to_end() = None
I have also tried an alternate approach using assign, but the result is the same:
# `brokers` and `topic` are expected to be defined by the surrounding code.
con = KafkaConsumer(bootstrap_servers=brokers)
ps = [TopicPartition(topic, p) for p in con.partitions_for_topic(topic)]
con.assign(ps)

# One highwater line per assigned partition.
for p in ps:
    print("For partition %s highwater is %s" % (p.partition, con.highwater(p)))

print("Subscription = %s" % con.subscription())
print("con.seek_to_beginning() = %s" % con.seek_to_beginning())
print("con.seek_to_end() = %s" % con.seek_to_end())
It seems from some of the documentation that I might get this behaviour if a fetch
has not been issued. But I cannot find a way to force that. What am I doing wrong?
Or is there a different/simpler way to get the latest offsets for a topic?
Answers:
Finally, after spending a day on this and several false starts, I was able to find a solution and get it working. Posting it here so that others may refer to it.
from kafka import SimpleClient
from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy
from kafka.common import OffsetRequestPayload

# NOTE(review): SimpleClient is kafka-python's legacy client API; it appears to
# have been removed in kafka-python 2.0 -- confirm before using on newer versions.
# `brokers` and `topic` are expected to be defined by the surrounding code.
client = SimpleClient(brokers)
partitions = client.topic_partitions[topic]

# OffsetResetStrategy.LATEST is the protocol value -1 ("latest offset");
# the final 1 limits each response to a single offset.
offset_requests = [
    OffsetRequestPayload(topic, p, OffsetResetStrategy.LATEST, 1)
    for p in partitions
]
offsets_responses = client.send_offset_request(offset_requests)

for r in offsets_responses:
    print("partition = %s, offset = %s" % (r.partition, r.offsets[0]))
If you wish to use Kafka shell scripts present in kafka/bin, then you can get latest and smallest offsets by using kafka-run-class.sh.
To get the latest offset, the command looks like this:
bin/kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list localhost:9092 --time -1 --topic topiname
To get the smallest offset, the command looks like this:
bin/kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list localhost:9092 --time -2 --topic topiname
You can find more information on Get Offsets Shell from following link
Hope this helps!
from kafka import KafkaConsumer, TopicPartition

TOPIC = 'MYTOPIC'
GROUP = 'MYGROUP'
BOOTSTRAP_SERVERS = ['kafka01:9092', 'kafka02:9092']

# Auto-commit is disabled so that inspecting offsets never moves the group's
# committed position.
consumer = KafkaConsumer(
    bootstrap_servers=BOOTSTRAP_SERVERS,
    group_id=GROUP,
    enable_auto_commit=False
)

for p in consumer.partitions_for_topic(TOPIC):
    tp = TopicPartition(TOPIC, p)
    consumer.assign([tp])
    committed = consumer.committed(tp)
    # After seeking to the end, position() is the offset of the next message
    # to be written, i.e. the latest offset of the partition.
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    # committed() returns None when the group has never committed on this
    # partition; lag is undefined then, so report None instead of raising
    # TypeError on the subtraction.
    lag = last_offset - committed if committed is not None else None
    print("topic: %s partition: %s committed: %s last: %s lag: %s" % (TOPIC, p, committed, last_offset, lag))

consumer.close(autocommit=False)
Another way to achieve this is to poll the consumer, which gives you the last consumed offset, and then use the seek_to_end method to move to the most recent available offset of each partition.
from kafka import KafkaConsumer
# Subscribe to 'my-topic' as a member of the consumer group 'my-group'.
consumer = KafkaConsumer('my-topic',
group_id='my-group',
bootstrap_servers=['localhost:9092'])
# NOTE(review): poll() is presumably needed so the group assigns partitions
# to this consumer before seeking -- confirm against the kafka-python docs.
consumer.poll()
# Called with no arguments and its result is discarded; reading the actual
# offsets would need a follow-up call (presumably position()) -- TODO confirm.
consumer.seek_to_end()
This method particularly comes in handy when using consumer groups.
SOURCES:
With kafka-python>=1.3.4
you can use:
kafka.KafkaConsumer.end_offsets(partitions)
Get the last offset for the given partitions. The last offset of a partition is the offset of the upcoming message, i.e. the offset of the last available message + 1.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

# `brokers` and `topic` are expected to be defined by the surrounding code.
consumer = KafkaConsumer(bootstrap_servers=brokers)
partition_ids = consumer.partitions_for_topic(topic)
topic_partitions = [
    TopicPartition(topic, partition_id) for partition_id in partition_ids
]
# The snippet only demonstrates the call; the returned offsets are not stored.
consumer.end_offsets(topic_partitions)
Using confluent-kafka-python
You can use `position`:
"Retrieve current positions (offsets) for the list of partitions."
from confluent_kafka import Consumer, TopicPartition

# confluent-kafka refuses to create a Consumer without "group.id"; the value
# here is a placeholder -- use your own group name.
consumer = Consumer({
    "bootstrap.servers": "localhost:9092",
    "group.id": "offset-inspector",
})
topic = consumer.list_topics(topic='topicName')
# Iterating the partitions mapping yields the partition ids directly.
partitions = [
    TopicPartition('topicName', partition)
    for partition in topic.topics['topicName'].partitions
]
# NOTE(review): position() reports this consumer's own current positions,
# which is presumably not the partition's latest offset unless the consumer
# has been assigned and has consumed/seeked to the end -- confirm.
offset_per_partition = consumer.position(partitions)
Alternatively, you can also use `get_watermark_offsets`, but you'd have to pass one partition at a time, so it requires multiple calls:
Retrieve low and high offsets for partition.
from confluent_kafka import Consumer, TopicPartition

# confluent-kafka refuses to create a Consumer without "group.id"; the value
# here is a placeholder -- use your own group name.
consumer = Consumer({
    "bootstrap.servers": "localhost:9092",
    "group.id": "offset-inspector",
})
topic = consumer.list_topics(topic='topicName')
# Iterating the partitions mapping yields the partition ids directly.
partitions = [
    TopicPartition('topicName', partition)
    for partition in topic.topics['topicName'].partitions
]

# One broker query per partition; the high watermark is the latest offset.
for p in partitions:
    low_offset, high_offset = consumer.get_watermark_offsets(p)
    print(f"Latest offset for partition {p}: {high_offset}")
Using kafka-python
You can use `end_offsets`:
Get the last offset for the given partitions. The last offset of a
partition is the offset of the upcoming message, i.e. the offset of
the last available message + 1.
This method does not change the current consumer position of the
partitions.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers="localhost:9092")
# Build one TopicPartition per partition id of 'myTopic'.
partitions = [TopicPartition('myTopic', p) for p in consumer.partitions_for_topic('myTopic')]
# Per the quoted documentation, this does not move the consumer's position.
last_offset_per_partition = consumer.end_offsets(partitions)
kafka-consumer-groups --bootstrap-server host1:9093,crow-host2:9093,host3:9093 --command-config=/root/client.properties --describe --group atlas
This command shows the status of the consumer group, including the current offset and lag for each partition.
Using kafka-python
While defining the consumer, the argument `auto_offset_reset` can be set to either 'earliest' or 'latest'. This is useful in case the consumer starts after the retention period and/or restarts after a failure: messages will then be consumed according to the auto.offset.reset configuration.
from json import loads

from kafka import KafkaConsumer

# auto_offset_reset='latest' starts from the newest messages when the group
# has no valid committed offset to resume from.
consumer = KafkaConsumer(
    'my-topic',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='latest',
    enable_auto_commit=True,
    group_id='my-group',
    # Message values are decoded as UTF-8 and parsed as JSON.
    value_deserializer=lambda x: loads(x.decode('utf-8')))
see this example.
I am using the Python high level consumer for Kafka and want to know the latest offsets for each partition of a topic. However I cannot get it to work.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

# `brokers` and `topic` are expected to be defined by the surrounding code.
con = KafkaConsumer(bootstrap_servers=brokers)
ps = [TopicPartition(topic, p) for p in con.partitions_for_topic(topic)]
con.assign(ps)

# Only this print belongs to the loop: the output shows one highwater line per
# partition, followed by a single "Subscription" line.
for p in ps:
    print("For partition %s highwater is %s" % (p.partition, con.highwater(p)))

print("Subscription = %s" % con.subscription())
print("con.seek_to_beginning() = %s" % con.seek_to_beginning())
But the output I get is
For partition 0 highwater is None
For partition 1 highwater is None
For partition 2 highwater is None
For partition 3 highwater is None
For partition 4 highwater is None
For partition 5 highwater is None
....
For partition 96 highwater is None
For partition 97 highwater is None
For partition 98 highwater is None
For partition 99 highwater is None
Subscription = None
con.seek_to_beginning() = None
con.seek_to_end() = None
I have also tried an alternate approach using assign, but the result is the same:
# `brokers` and `topic` are expected to be defined by the surrounding code.
con = KafkaConsumer(bootstrap_servers=brokers)
ps = [TopicPartition(topic, p) for p in con.partitions_for_topic(topic)]
con.assign(ps)

# One highwater line per assigned partition.
for p in ps:
    print("For partition %s highwater is %s" % (p.partition, con.highwater(p)))

print("Subscription = %s" % con.subscription())
print("con.seek_to_beginning() = %s" % con.seek_to_beginning())
print("con.seek_to_end() = %s" % con.seek_to_end())
It seems from some of the documentation that I might get this behaviour if a fetch
has not been issued. But I cannot find a way to force that. What am I doing wrong?
Or is there a different/simpler way to get the latest offsets for a topic?
Finally, after spending a day on this and several false starts, I was able to find a solution and get it working. Posting it here so that others may refer to it.
from kafka import SimpleClient
from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy
from kafka.common import OffsetRequestPayload

# NOTE(review): SimpleClient is kafka-python's legacy client API; it appears to
# have been removed in kafka-python 2.0 -- confirm before using on newer versions.
# `brokers` and `topic` are expected to be defined by the surrounding code.
client = SimpleClient(brokers)
partitions = client.topic_partitions[topic]

# OffsetResetStrategy.LATEST is the protocol value -1 ("latest offset");
# the final 1 limits each response to a single offset.
offset_requests = [
    OffsetRequestPayload(topic, p, OffsetResetStrategy.LATEST, 1)
    for p in partitions
]
offsets_responses = client.send_offset_request(offset_requests)

for r in offsets_responses:
    print("partition = %s, offset = %s" % (r.partition, r.offsets[0]))
If you wish to use Kafka shell scripts present in kafka/bin, then you can get latest and smallest offsets by using kafka-run-class.sh.
To get the latest offset, the command looks like this:
bin/kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list localhost:9092 --time -1 --topic topiname
To get the smallest offset, the command looks like this:
bin/kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list localhost:9092 --time -2 --topic topiname
You can find more information on Get Offsets Shell from following link
Hope this helps!
from kafka import KafkaConsumer, TopicPartition

TOPIC = 'MYTOPIC'
GROUP = 'MYGROUP'
BOOTSTRAP_SERVERS = ['kafka01:9092', 'kafka02:9092']

# Auto-commit is disabled so that inspecting offsets never moves the group's
# committed position.
consumer = KafkaConsumer(
    bootstrap_servers=BOOTSTRAP_SERVERS,
    group_id=GROUP,
    enable_auto_commit=False
)

for p in consumer.partitions_for_topic(TOPIC):
    tp = TopicPartition(TOPIC, p)
    consumer.assign([tp])
    committed = consumer.committed(tp)
    # After seeking to the end, position() is the offset of the next message
    # to be written, i.e. the latest offset of the partition.
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    # committed() returns None when the group has never committed on this
    # partition; lag is undefined then, so report None instead of raising
    # TypeError on the subtraction.
    lag = last_offset - committed if committed is not None else None
    print("topic: %s partition: %s committed: %s last: %s lag: %s" % (TOPIC, p, committed, last_offset, lag))

consumer.close(autocommit=False)
Another way to achieve this is to poll the consumer, which gives you the last consumed offset, and then use the seek_to_end method to move to the most recent available offset of each partition.
from kafka import KafkaConsumer
# Subscribe to 'my-topic' as a member of the consumer group 'my-group'.
consumer = KafkaConsumer('my-topic',
group_id='my-group',
bootstrap_servers=['localhost:9092'])
# NOTE(review): poll() is presumably needed so the group assigns partitions
# to this consumer before seeking -- confirm against the kafka-python docs.
consumer.poll()
# Called with no arguments and its result is discarded; reading the actual
# offsets would need a follow-up call (presumably position()) -- TODO confirm.
consumer.seek_to_end()
This method particularly comes in handy when using consumer groups.
SOURCES:
With kafka-python>=1.3.4
you can use:
kafka.KafkaConsumer.end_offsets(partitions)
Get the last offset for the given partitions. The last offset of a partition is the offset of the upcoming message, i.e. the offset of the last available message + 1.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

# `brokers` and `topic` are expected to be defined by the surrounding code.
consumer = KafkaConsumer(bootstrap_servers=brokers)
partition_ids = consumer.partitions_for_topic(topic)
topic_partitions = [
    TopicPartition(topic, partition_id) for partition_id in partition_ids
]
# The snippet only demonstrates the call; the returned offsets are not stored.
consumer.end_offsets(topic_partitions)
Using confluent-kafka-python
You can use `position`:
"Retrieve current positions (offsets) for the list of partitions."
from confluent_kafka import Consumer, TopicPartition

# confluent-kafka refuses to create a Consumer without "group.id"; the value
# here is a placeholder -- use your own group name.
consumer = Consumer({
    "bootstrap.servers": "localhost:9092",
    "group.id": "offset-inspector",
})
topic = consumer.list_topics(topic='topicName')
# Iterating the partitions mapping yields the partition ids directly.
partitions = [
    TopicPartition('topicName', partition)
    for partition in topic.topics['topicName'].partitions
]
# NOTE(review): position() reports this consumer's own current positions,
# which is presumably not the partition's latest offset unless the consumer
# has been assigned and has consumed/seeked to the end -- confirm.
offset_per_partition = consumer.position(partitions)
Alternatively, you can also use `get_watermark_offsets`, but you'd have to pass one partition at a time, so it requires multiple calls:
Retrieve low and high offsets for partition.
from confluent_kafka import Consumer, TopicPartition

# confluent-kafka refuses to create a Consumer without "group.id"; the value
# here is a placeholder -- use your own group name.
consumer = Consumer({
    "bootstrap.servers": "localhost:9092",
    "group.id": "offset-inspector",
})
topic = consumer.list_topics(topic='topicName')
# Iterating the partitions mapping yields the partition ids directly.
partitions = [
    TopicPartition('topicName', partition)
    for partition in topic.topics['topicName'].partitions
]

# One broker query per partition; the high watermark is the latest offset.
for p in partitions:
    low_offset, high_offset = consumer.get_watermark_offsets(p)
    print(f"Latest offset for partition {p}: {high_offset}")
Using kafka-python
You can use `end_offsets`:
Get the last offset for the given partitions. The last offset of a
partition is the offset of the upcoming message, i.e. the offset of
the last available message + 1. This method does not change the current consumer position of the
partitions.
from kafka import TopicPartition
from kafka.consumer import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers="localhost:9092")
# Build one TopicPartition per partition id of 'myTopic'.
partitions = [TopicPartition('myTopic', p) for p in consumer.partitions_for_topic('myTopic')]
# Per the quoted documentation, this does not move the consumer's position.
last_offset_per_partition = consumer.end_offsets(partitions)
kafka-consumer-groups --bootstrap-server host1:9093,crow-host2:9093,host3:9093 --command-config=/root/client.properties --describe --group atlas
This command shows the status of the consumer group, including the current offset and lag for each partition.
Using kafka-python
While defining the consumer, the argument `auto_offset_reset` can be set to either 'earliest' or 'latest'. This is useful in case the consumer starts after the retention period and/or restarts after a failure: messages will then be consumed according to the auto.offset.reset configuration.
from json import loads

from kafka import KafkaConsumer

# auto_offset_reset='latest' starts from the newest messages when the group
# has no valid committed offset to resume from.
consumer = KafkaConsumer(
    'my-topic',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='latest',
    enable_auto_commit=True,
    group_id='my-group',
    # Message values are decoded as UTF-8 and parsed as JSON.
    value_deserializer=lambda x: loads(x.decode('utf-8')))
see this example.