Python kafka.TopicPartition() Examples

The following are 14 code examples of kafka.TopicPartition(), drawn from open-source projects. The line above each example names the project and source file it comes from. You may also want to check out all other available functions and classes of the kafka module.
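TopicPartition is a lightweight (topic, partition) pair that kafka-python uses to address a single partition, both when manually assigning partitions to a consumer and as the key type returned by metadata APIs such as end_offsets() and offsets_for_times(). Below is a minimal sketch of typical construction and manual assignment; the broker address and topic name are placeholders.

from kafka import KafkaConsumer, TopicPartition

# Broker address and topic name are placeholders for illustration.
consumer = KafkaConsumer(bootstrap_servers='localhost:9092')

# Address partition 0 of the topic explicitly and assign it to the consumer,
# bypassing group-based subscription.
tp = TopicPartition('example-topic', 0)
consumer.assign([tp])

# TopicPartition values are also the keys returned by metadata APIs.
end_offsets = consumer.end_offsets([tp])
print(end_offsets[tp])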
Example #1
Source File: client.py    From search-MjoLniR with MIT License
def offsets_for_times(consumer, partitions, timestamp):
    """Augment KafkaConsumer.offsets_for_times to not return None

    Parameters
    ----------
    consumer : kafka.KafkaConsumer
        This consumer must only be used for collecting metadata, and not
        consuming. APIs will be used that invalidate consuming.
    partitions : list of kafka.TopicPartition
    timestamp : number
        Timestamp, in seconds since unix epoch, to return offsets for.

    Returns
    -------
    dict from kafka.TopicPartition to integer offset
    """
    # Kafka uses millisecond timestamps
    timestamp_ms = int(timestamp * 1000)
    response = consumer.offsets_for_times({p: timestamp_ms for p in partitions})
    offsets = {}
    for tp, offset_and_timestamp in response.items():
        if offset_and_timestamp is None:
            # No messages exist after timestamp. Fetch latest offset.
            consumer.assign([tp])
            consumer.seek_to_end(tp)
            offsets[tp] = consumer.position(tp)
        else:
            offsets[tp] = offset_and_timestamp.offset
    return offsets 
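A possible usage sketch for the helper above, assuming a broker at localhost:9092 and a topic named 'example-topic' (both placeholders); the consumer is created only to fetch metadata, as the docstring requires.

import time
import kafka

metadata_consumer = kafka.KafkaConsumer(bootstrap_servers='localhost:9092')
partitions = [kafka.TopicPartition('example-topic', p)
              for p in metadata_consumer.partitions_for_topic('example-topic')]
# Offset of the first message written within the last hour for each partition,
# falling back to the end of the partition when nothing newer exists.
offsets = offsets_for_times(metadata_consumer, partitions, time.time() - 3600)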
Example #2
Source File: test_kafka.py    From py-timeexecution with Apache License 2.0
def _query_backend(self):
        consumer = KafkaConsumer(
            bootstrap_servers=KAFKA_HOST, value_deserializer=lambda v: JSONSerializer().loads(v.decode('utf-8'))
        )

        tp = TopicPartition(self.topic, 0)
        consumer.assign([tp])
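        # With no committed offsets and auto_offset_reset at its default of 'latest',
        # position() below resolves to the end offset; assuming the partition starts
        # at offset 0, that value doubles as the number of records to read back.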

        count = consumer.position(tp)

        consumer.seek(tp, 0)

        metrics = []
        for i in range(count):
            metrics.append(next(consumer))

        return metrics 
Example #3
Source File: kafka_utils_class.py    From warriorframework with Apache License 2.0
def assign_partitions(self, partitions):
        """
        Assign partitions to consumer.
        Arguments:
          partitions(list) : list of [topic, partition] lists
            example : [[topic1,1], [topic2,1]]
        Returns:
            result(bool) : False if an exception occurs, True otherwise
        """
        print_info("assigning partitions to consumer {}".format(partitions))
        topic_partitions = [TopicPartition(topic=tup[0], partition=tup[1]) for tup in partitions]
        try:
            self.kafka_consumer.assign(topic_partitions)
            result = True
        except KafkaError as exc:
            print_error("Exception during assigning partitions - {}".format(exc))
            result = False
        return result 
Example #4
Source File: kafka_utils_class.py    From warriorframework with Apache License 2.0
def seek_to_position(self, topic, partition, offset):
        """
        Seek to the given offset.
        Arguments:
          topic(str): topic name
          partition(int): partition number
          offset(int): offset number
        Returns:
          result(bool) : False if an exception occurs, True otherwise
        """
        print_info("seeking to position {}:{}:{}".format(topic, partition, offset))
        topic_partition = TopicPartition(topic=topic, partition=partition)
        try:
            self.kafka_consumer.seek(partition=topic_partition, offset=offset)
            result = True
        except KafkaError as exc:
            print_error("Exception during seek - {}".format(exc))
            result = False
        return result 
Example #5
Source File: messaging.py    From Ad-Insertion-Sample with BSD 3-Clause "New" or "Revised" License
def debug(self, topic):
        c=KafkaConsumer(bootstrap_servers=kafka_hosts, client_id=self._client_id , group_id=None, api_version=(0,10))

        # assign/subscribe topic
        partitions=c.partitions_for_topic(topic)
        if not partitions: raise Exception("Topic "+topic+" does not exist")
        c.assign([TopicPartition(topic,p) for p in partitions])

        # seek to beginning if needed
        c.seek_to_beginning()

        # fetch messages
        while True:
            partitions=c.poll(100)
            if partitions:
                for p in partitions:
                    for msg in partitions[p]:
                        yield msg.value.decode('utf-8')
            yield ""

        c.close() 
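A hedged driver for the generator above; messenger stands in for an instance of the surrounding messaging class, and the topic name is a placeholder. Because the generator yields an empty string whenever a poll returns nothing, a caller typically skips those values.

for value in messenger.debug('example-topic'):
    if not value:
        continue  # empty poll, keep waiting
    print(value)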
Example #6
Source File: test_client.py    From search-MjoLniR with MIT License
def test_offset_for_times(mocker):
    partitions = [kafka.TopicPartition('ut_topic', 0)]
    offsets_for_times = {tp: OffsetAndTimestamp(42, -1) for tp in partitions}
    positions = {tp: 747 for tp in partitions}

    mock = mocker.Mock()
    mock.offsets_for_times.return_value = offsets_for_times
    mock.position.side_effect = lambda tp: positions.get(tp, 0)

    # Uses returned offset for time when provided
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 42

    # When offsets_for_times returns None, fall back to the position at the end
    offsets_for_times[partitions[0]] = None
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 747 
Example #7
Source File: client.py    From search-MjoLniR with MIT License
def offset_range_for_timestamp_range(brokers, start, end, topic):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        Kafka brokers to bootstrap the metadata consumer from
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Topic to fetch offsets for

    Returns
    -------
    list of OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [OffsetRange(tp, o_start[tp], o_end[tp]) for tp in partitions] 
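A hypothetical call, assuming a placeholder broker list and topic name; it covers the last 24 hours of the topic and yields one OffsetRange per partition, or None if the topic does not exist.

import time

now = time.time()
ranges = offset_range_for_timestamp_range(
    ['broker1.example.org:9092'], now - 86400, now, 'example-topic')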
Example #8
Source File: inference_cache.py    From rafiki with Apache License 2.0
def pop_queries_for_worker(self, worker_id: str, batch_size: int) -> List[Query]:
        name = f'workers_{worker_id}_queries'

        query_consumer = KafkaConsumer(name, bootstrap_servers=self.connection_url, auto_offset_reset='earliest', group_id=QUERIES_QUEUE)
        
        partition = TopicPartition(name, 0)
        partitiondic = query_consumer.end_offsets([partition])
        offsetend = partitiondic.get(partition, None)
        if offsetend == 0:
            query_consumer.close()
            return []
        try:
            queries = []
            while True:
                record = next(query_consumer)
                queries.append(record.value)
                query_consumer.commit()
                if record.offset >= offsetend-1 or len(queries) == batch_size:
                    break
                
            queries = [pickle.loads(x) for x in queries]
            query_consumer.close()
            return queries
        except KafkaError:
            query_consumer.close()
            return [] 
Example #9
Source File: kafka_utils.py    From fooltrader with MIT License
def get_latest_timestamp_order_from_topic(topic):
    consumer = KafkaConsumer(topic,
                             # client_id='fooltrader',
                             # group_id='fooltrader',
                             value_deserializer=lambda m: json.loads(m.decode('utf8')),
                             bootstrap_servers=[KAFKA_HOST])
    topic_partition = TopicPartition(topic=topic, partition=0)
    end_offset = consumer.end_offsets([topic_partition])[topic_partition]
    if end_offset > 0:
        # the partition is assigned after poll, so we can seek
        consumer.poll(5, 1)

        consumer.seek(topic_partition, end_offset - 1)
        message = consumer.poll(10000, 500)
        msgs = message[topic_partition]
        if len(msgs) > 0:
            record = msgs[-1]
            timestamp = to_timestamp(record.value['timestamp'])
            order = None
            if 'order' in record.value:
                order = record.value['order']
            return timestamp, order
    return None, None 
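A hypothetical call with a placeholder topic name; the helper returns (None, None) when the topic has no messages.

latest_timestamp, latest_order = get_latest_timestamp_order_from_topic('example-topic')
if latest_timestamp is None:
    print("topic is empty")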
Example #10
Source File: kafka.py    From django-logpipe with ISC License
def seek(self, consumer, topic, partition):
        KafkaOffset = apps.get_model(app_label='logpipe', model_name='KafkaOffset')
        tp = kafka.TopicPartition(topic=topic, partition=partition)
        try:
            obj = KafkaOffset.objects.get(topic=topic, partition=partition)
            logger.debug('Seeking to offset "%s" on topic "%s", partition "%s"' % (obj.offset, topic, partition))
            consumer.client.seek(tp, obj.offset)
        except KafkaOffset.DoesNotExist:
            logger.debug('Seeking to beginning of topic "%s", partition "%s"' % (topic, partition))
            consumer.client.seek_to_beginning(tp) 
Example #11
Source File: kafka.py    From django-logpipe with ISC License
def _get_topic_partitions(self):
        p = []
        partitions = self.client.partitions_for_topic(self.topic_name)
        if not partitions:
            raise MissingTopicError('Could not find topic %s. Does it exist?' % self.topic_name)
        for partition in partitions:
            tp = kafka.TopicPartition(self.topic_name, partition=partition)
            p.append(tp)
        return p 
Example #12
Source File: msearch_daemon.py    From search-MjoLniR with MIT License
def iter_records(self) -> Generator[Mapping, None, None]:
        consumer = kafka.KafkaConsumer(bootstrap_servers=self.brokers,
                                       group_id='mjolnir_msearch',
                                       enable_auto_commit=False,
                                       auto_offset_reset='latest',
                                       value_deserializer=lambda x: json.loads(x.decode('utf8')),
                                       api_version=mjolnir.kafka.BROKER_VERSION,
                                       # Msearch requests are relatively heavy at a few tens of ms each.
                                       # 50 requests at 50ms each gives us ~2.5s to process a batch. We
                                       # keep this low so kafka regularly gets re-pinged.
                                       max_poll_records=min(500, 50 * self.n_workers))
        consumer.subscribe([self.topic_work])
        try:
            last_commit = 0.0
            offset_commit_interval_sec = 60
            offsets = cast(Dict[kafka.TopicPartition, kafka.OffsetAndMetadata], dict())
            while self.load_monitor.is_below_threshold:
                now = time.monotonic()
                if offsets and now - last_commit > offset_commit_interval_sec:
                    consumer.commit_async(offsets)
                    last_commit = now
                    offsets = {}
                # By polling directly, rather than using the iter based api, we
                # have the opportunity to regularly re-check the load monitor
                # and transition out of the consuming state if needed.
                poll_response = consumer.poll(timeout_ms=60000)
                if not poll_response:
                    continue
                with Metric.PROCESS_BATCH.time():
                    for tp, records in poll_response.items():
                        for record in records:
                            self.load_monitor.notify()
                            yield record.value
                    # Wait for all the work to complete
                    self.work_queue.join()
                for tp, records in poll_response.items():
                    offsets[tp] = kafka.OffsetAndMetadata(records[-1].offset + 1, '')
                Metric.RECORDS_PROCESSED.inc(sum(len(x) for x in poll_response.values()))
        finally:
            if offsets:
                consumer.commit(offsets)
            consumer.close() 
Example #13
Source File: client.py    From search-MjoLniR with MIT License
def kafka_to_rdd(sc, client_config, offset_ranges):
    """Read ranges of kafka partitions into an RDD.

    Parameters
    ----------
    sc : pyspark.SparkContext
    client_config : ClientConfig
    offset_ranges : list of OffsetRange
        List of topic partitions along with ranges to read. Start
        and end of range are inclusive.

    Returns
    -------
    pyspark.RDD
        Contents of the specified offset_ranges
    """
    def read_offset_range(offset_range):
        if offset_range.end <= offset_range.start:
            # Raise exception?
            return
        # After a serialization round trip these fail an isinstance check;
        # re-instantiate so we have the expected type.
        tp = kafka.TopicPartition(*offset_range.tp)
        consumer = kafka.KafkaConsumer(bootstrap_servers=client_config.brokers,
                                       value_deserializer=lambda x: json.loads(x.decode('utf8')))
        try:
            consumer.assign([tp])
            consumer.seek(tp, offset_range.start)
            while True:
                poll_response = consumer.poll(timeout_ms=10000)
                if poll_response and tp in poll_response:
                    for message in poll_response[tp]:
                        if message.offset > offset_range.end:
                            break
                        yield message.value
                if consumer.position(tp) >= offset_range.end:
                    break
        finally:
            consumer.close()

    return (
        # TODO: This isn't the same as assigning each offset_range to a separate
        # partition, but it doesn't seem like pyspark allows us to do that. Often
        # enough this seems to achieve the same thing, but without guarantees.
        sc.parallelize(offset_ranges, len(offset_ranges))
        .flatMap(read_offset_range)
    ) 
Example #14
Source File: bot.py    From fooltrader with MIT License
def consume_topic_with_func(self, topic, func):
        consumer = KafkaConsumer(topic,
                                 client_id='fooltrader',
                                 group_id=self.bot_name,
                                 value_deserializer=lambda m: json.loads(m.decode('utf8')),
                                 bootstrap_servers=[KAFKA_HOST])
        topic_partition = TopicPartition(topic=topic, partition=0)

        if self.start_timestamp:
            start_timestamp = int(self.start_timestamp.timestamp() * 1000)

            end_offset = consumer.end_offsets([topic_partition])[topic_partition]
            if end_offset == 0:
                self.logger.warning("topic:{} end offset:{}".format(topic, end_offset))
                self.logger.error("the topic:{} has no data,but you want to backtest".format(self.quote_topic))
                return

            # find the offset from start_timestamp
            offset_and_timestamp = consumer.offsets_for_times({topic_partition: start_timestamp})

            if offset_and_timestamp:
                offset_and_timestamp = offset_and_timestamp[topic_partition]

                if offset_and_timestamp:
                    # the partition is assigned after poll, so we can seek
                    consumer.poll(5, 1)
                    # move to the offset
                    consumer.seek(topic_partition, offset_and_timestamp.offset)

                    for message in consumer:
                        if 'timestamp' in message.value:
                            message_time = to_timestamp(message.value['timestamp'])
                        else:
                            message_time = to_timestamp(message.timestamp)

                        if self.end_timestamp and (message_time > self.end_timestamp):
                            consumer.close()
                            break

                        getattr(self, func)(message.value)

                else:
                    latest_timestamp, _ = get_latest_timestamp_order_from_topic(self.quote_topic)
                    self.logger.warning(
                        "start:{} is after the last record:{}".format(self.start_timestamp, latest_timestamp))