Python kafka.TopicPartition() Examples

The following are 14 code examples of kafka.TopicPartition(), drawn from open-source projects. The line above each example names the project and source file it comes from. You may also want to check out all other available functions and classes of the kafka module.
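TopicPartition is a lightweight (topic, partition) pair that kafka-python uses to address a single partition, both when manually assigning partitions to a consumer and as the key type returned by metadata APIs such as end_offsets() and offsets_for_times(). Below is a minimal sketch of typical construction and manual assignment; the broker address and topic name are placeholders.

from kafka import KafkaConsumer, TopicPartition

# Broker address and topic name are placeholders for illustration.
consumer = KafkaConsumer(bootstrap_servers='localhost:9092')

# Address partition 0 of the topic explicitly and assign it to the consumer,
# bypassing group-based subscription.
tp = TopicPartition('example-topic', 0)
consumer.assign([tp])

# TopicPartition values are also the keys returned by metadata APIs.
end_offsets = consumer.end_offsets([tp])
print(end_offsets[tp])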
Example #1
Source File: client.py    From search-MjoLniR with MIT License
def offsets_for_times(consumer, partitions, timestamp):
    """Augment KafkaConsumer.offsets_for_times to not return None

    Parameters
    ----------
    consumer : kafka.KafkaConsumer
        This consumer must only be used for collecting metadata, and not
        consuming. APIs will be used that invalidate consuming.
    partitions : list of kafka.TopicPartition
    timestamp : number
        Timestamp, in seconds since unix epoch, to return offsets for.

    Returns
    -------
    dict from kafka.TopicPartition to integer offset
    """
    # Kafka uses millisecond timestamps
    timestamp_ms = int(timestamp * 1000)
    response = consumer.offsets_for_times({p: timestamp_ms for p in partitions})
    offsets = {}
    for tp, offset_and_timestamp in response.items():
        if offset_and_timestamp is None:
            # No messages exist after timestamp. Fetch latest offset.
            consumer.assign([tp])
            consumer.seek_to_end(tp)
            offsets[tp] = consumer.position(tp)
        else:
            offsets[tp] = offset_and_timestamp.offset
    return offsets 
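A possible usage sketch for the helper above, assuming a broker at localhost:9092 and a topic named 'example-topic' (both placeholders); the consumer is created only to fetch metadata, as the docstring requires.

import time
import kafka

metadata_consumer = kafka.KafkaConsumer(bootstrap_servers='localhost:9092')
partitions = [kafka.TopicPartition('example-topic', p)
              for p in metadata_consumer.partitions_for_topic('example-topic')]
# Offset of the first message written within the last hour for each partition,
# falling back to the end of the partition when nothing newer exists.
offsets = offsets_for_times(metadata_consumer, partitions, time.time() - 3600)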
Example #2
Source File: test_kafka.py    From py-timeexecution with Apache License 2.0
def _query_backend(self):
        consumer = KafkaConsumer(
            bootstrap_servers=KAFKA_HOST, value_deserializer=lambda v: JSONSerializer().loads(v.decode('utf-8'))
        )

        tp = TopicPartition(self.topic, 0)
        consumer.assign([tp])
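        # With no committed offsets and auto_offset_reset at its default of 'latest',
        # position() below resolves to the end offset; assuming the partition starts
        # at offset 0, that value doubles as the number of records to read back.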

        count = consumer.position(tp)

        consumer.seek(tp, 0)

        metrics = []
        for i in range(count):
            metrics.append(next(consumer))

        return metrics 
Example #3
Source File: kafka_utils_class.py    From warriorframework with Apache License 2.0
def assign_partitions(self, partitions):
        """
        Assign partitions to consumer.
        Arguments:
          partitions(list) : list of [topic, partition] lists
            example : [[topic1,1], [topic2,1]]
        Returns:
            result(bool) : False if an exception occurs, True otherwise
        """
        print_info("assigning partitions to consumer {}".format(partitions))
        topic_partitions = [TopicPartition(topic=tup[0], partition=tup[1]) for tup in partitions]
        try:
            self.kafka_consumer.assign(topic_partitions)
            result = True
        except KafkaError as exc:
            print_error("Exception during assigning partitions - {}".format(exc))
            result = False
        return result 
Example #4
Source File: kafka_utils_class.py    From warriorframework with Apache License 2.0
def seek_to_position(self, topic, partition, offset):
        """
        Seek to the given offset.
        Arguments:
          topic(str): topic name
          partition(int): partition number
          offset(int): offset number
        Returns:
          result(bool) : False if an exception occurs, True otherwise
        """
        print_info("seeking to position {}:{}:{}".format(topic, partition, offset))
        topic_partition = TopicPartition(topic=topic, partition=partition)
        try:
            self.kafka_consumer.seek(partition=topic_partition, offset=offset)
            result = True
        except KafkaError as exc:
            print_error("Exception during seek - {}".format(exc))
            result = False
        return result 
Example #5
Source File: messaging.py    From Ad-Insertion-Sample with BSD 3-Clause "New" or "Revised" License
def debug(self, topic):
        c=KafkaConsumer(bootstrap_servers=kafka_hosts, client_id=self._client_id , group_id=None, api_version=(0,10))

        # assign/subscribe topic
        partitions=c.partitions_for_topic(topic)
        if not partitions: raise Exception("Topic "+topic+" does not exist")
        c.assign([TopicPartition(topic,p) for p in partitions])

        # seek to beginning if needed
        c.seek_to_beginning()

        # fetch messages
        while True:
            partitions=c.poll(100)
            if partitions:
                for p in partitions:
                    for msg in partitions[p]:
                        yield msg.value.decode('utf-8')
            yield ""

        c.close() 
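A hedged driver for the generator above; messenger stands in for an instance of the surrounding messaging class, and the topic name is a placeholder. Because the generator yields an empty string whenever a poll returns nothing, a caller typically skips those values.

for value in messenger.debug('example-topic'):
    if not value:
        continue  # empty poll, keep waiting
    print(value)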
Example #6
Source File: test_client.py    From search-MjoLniR with MIT License
def test_offset_for_times(mocker):
    partitions = [kafka.TopicPartition('ut_topic', 0)]
    offsets_for_times = {tp: OffsetAndTimestamp(42, -1) for tp in partitions}
    positions = {tp: 747 for tp in partitions}

    mock = mocker.Mock()
    mock.offsets_for_times.return_value = offsets_for_times
    mock.position.side_effect = lambda tp: positions.get(tp, 0)

    # Uses returned offset for time when provided
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 42

    # When offsets_for_times returns None, fall back to the position at the end
    offsets_for_times[partitions[0]] = None
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 747 
Example #7
Source File: client.py    From search-MjoLniR with MIT License
def offset_range_for_timestamp_range(brokers, start, end, topic):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        Kafka brokers to bootstrap the metadata consumer from
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Topic to fetch offsets for

    Returns
    -------
    list of OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [OffsetRange(tp, o_start[tp], o_end[tp]) for tp in partitions] 
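A hypothetical call, assuming a placeholder broker list and topic name; it covers the last 24 hours of the topic and yields one OffsetRange per partition, or None if the topic does not exist.

import time

now = time.time()
ranges = offset_range_for_timestamp_range(
    ['broker1.example.org:9092'], now - 86400, now, 'example-topic')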
Example #8
Source File: inference_cache.py    From rafiki with Apache License 2.0
def pop_queries_for_worker(self, worker_id: str, batch_size: int) -> List[Query]:
        name = f'workers_{worker_id}_queries'

        query_consumer = KafkaConsumer(name, bootstrap_servers=self.connection_url, auto_offset_reset='earliest', group_id=QUERIES_QUEUE)
        
        partition = TopicPartition(name, 0)
        partitiondic = query_consumer.end_offsets([partition])
        offsetend = partitiondic.get(partition, None)
        if offsetend == 0:
            query_consumer.close()
            return []
        try:
            queries = []
            while True:
                record = next(query_consumer)
                queries.append(record.value)
                query_consumer.commit()
                if record.offset >= offsetend-1 or len(queries) == batch_size:
                    break
                
            queries = [pickle.loads(x) for x in queries]
            query_consumer.close()
            return queries
        except KafkaError:
            query_consumer.close()
            return [] 
Example #9
Source File: kafka_utils.py    From fooltrader with MIT License
def get_latest_timestamp_order_from_topic(topic):
    consumer = KafkaConsumer(topic,
                             # client_id='fooltrader',
                             # group_id='fooltrader',
                             value_deserializer=lambda m: json.loads(m.decode('utf8')),
                             bootstrap_servers=[KAFKA_HOST])
    topic_partition = TopicPartition(topic=topic, partition=0)
    end_offset = consumer.end_offsets([topic_partition])[topic_partition]
    if end_offset > 0:
        # the partition is assigned after poll, so we can seek
        consumer.poll(5, 1)

        consumer.seek(topic_partition, end_offset - 1)
        message = consumer.poll(10000, 500)
        msgs = message[topic_partition]
        if len(msgs) > 0:
            record = msgs[-1]
            timestamp = to_timestamp(record.value['timestamp'])
            order = None
            if 'order' in record.value:
                order = record.value['order']
            return timestamp, order
    return None, None 
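A hypothetical call with a placeholder topic name; the helper returns (None, None) when the topic has no messages.

latest_timestamp, latest_order = get_latest_timestamp_order_from_topic('example-topic')
if latest_timestamp is None:
    print("topic is empty")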
Example #10
Source File: kafka.py    From django-logpipe with ISC License
def seek(self, consumer, topic, partition):
        KafkaOffset = apps.get_model(app_label='logpipe', model_name='KafkaOffset')
        tp = kafka.TopicPartition(topic=topic, partition=partition)
        try:
            obj = KafkaOffset.objects.get(topic=topic, partition=partition)
            logger.debug('Seeking to offset "%s" on topic "%s", partition "%s"' % (obj.offset, topic, partition))
            consumer.client.seek(tp, obj.offset)
        except KafkaOffset.DoesNotExist:
            logger.debug('Seeking to beginning of topic "%s", partition "%s"' % (topic, partition))
            consumer.client.seek_to_beginning(tp) 
Example #11
Source File: kafka.py    From django-logpipe with ISC License
def _get_topic_partitions(self):
        p = []
        partitions = self.client.partitions_for_topic(self.topic_name)
        if not partitions:
            raise MissingTopicError('Could not find topic %s. Does it exist?' % self.topic_name)
        for partition in partitions:
            tp = kafka.TopicPartition(self.topic_name, partition=partition)
            p.append(tp)
        return p 
Example #12
Source File: msearch_daemon.py    From search-MjoLniR with MIT License
def iter_records(self) -> Generator[Mapping, None, None]:
        consumer = kafka.KafkaConsumer(bootstrap_servers=self.brokers,
                                       group_id='mjolnir_msearch',
                                       enable_auto_commit=False,
                                       auto_offset_reset='latest',
                                       value_deserializer=lambda x: json.loads(x.decode('utf8')),
                                       api_version=mjolnir.kafka.BROKER_VERSION,
                                       # Msearch requests are relatively heavy at a few tens of ms each.
                                       # 50 requests at 50ms each gives us ~2.5s to process a batch. We
                                       # keep this low so kafka regularly gets re-pinged.
                                       max_poll_records=min(500, 50 * self.n_workers))
        consumer.subscribe([self.topic_work])
        try:
            last_commit = 0.0
            offset_commit_interval_sec = 60
            offsets = cast(Dict[kafka.TopicPartition, kafka.OffsetAndMetadata], dict())
            while self.load_monitor.is_below_threshold:
                now = time.monotonic()
                if offsets and now - last_commit > offset_commit_interval_sec:
                    consumer.commit_async(offsets)
                    last_commit = now
                    offsets = {}
                # By polling directly, rather than using the iter based api, we
                # have the opportunity to regularly re-check the load monitor
                # and transition out of the consuming state if needed.
                poll_response = consumer.poll(timeout_ms=60000)
                if not poll_response:
                    continue
                with Metric.PROCESS_BATCH.time():
                    for tp, records in poll_response.items():
                        for record in records:
                            self.load_monitor.notify()
                            yield record.value
                    # Wait for all the work to complete
                    self.work_queue.join()
                for tp, records in poll_response.items():
                    offsets[tp] = kafka.OffsetAndMetadata(records[-1].offset + 1, '')
                Metric.RECORDS_PROCESSED.inc(sum(len(x) for x in poll_response.values()))
        finally:
            if offsets:
                consumer.commit(offsets)
            consumer.close() 
Example #13
Source File: client.py    From search-MjoLniR with MIT License
def kafka_to_rdd(sc, client_config, offset_ranges):
    """Read ranges of kafka partitions into an RDD.

    Parameters
    ----------
    sc : pyspark.SparkContext
    client_config : ClientConfig
    offset_ranges : list of OffsetRange
        List of topic partitions along with ranges to read. Start
        and end of range are inclusive.

    Returns
    -------
    pyspark.RDD
        Contents of the specified offset_ranges
    """
    def read_offset_range(offset_range):
        if offset_range.end <= offset_range.start:
            # Raise exception?
            return
        # After a serialization round trip these fail an isinstance check;
        # re-instantiate so we have the expected type.
        tp = kafka.TopicPartition(*offset_range.tp)
        consumer = kafka.KafkaConsumer(bootstrap_servers=client_config.brokers,
                                       value_deserializer=lambda x: json.loads(x.decode('utf8')))
        try:
            consumer.assign([tp])
            consumer.seek(tp, offset_range.start)
            while True:
                poll_response = consumer.poll(timeout_ms=10000)
                if poll_response and tp in poll_response:
                    for message in poll_response[tp]:
                        if message.offset > offset_range.end:
                            break
                        yield message.value
                if consumer.position(tp) >= offset_range.end:
                    break
        finally:
            consumer.close()

    return (
        # TODO: This isn't the same as assigning each offset_range to a separate
        # partition, but it doesn't seem like pyspark allows us to do that. Often
        # enough this seems to achieve the same thing, but without guarantees.
        sc.parallelize(offset_ranges, len(offset_ranges))
        .flatMap(read_offset_range)
    ) 
Example #14
Source File: bot.py    From fooltrader with MIT License
def consume_topic_with_func(self, topic, func):
        consumer = KafkaConsumer(topic,
                                 client_id='fooltrader',
                                 group_id=self.bot_name,
                                 value_deserializer=lambda m: json.loads(m.decode('utf8')),
                                 bootstrap_servers=[KAFKA_HOST])
        topic_partition = TopicPartition(topic=topic, partition=0)

        if self.start_timestamp:
            start_timestamp = int(self.start_timestamp.timestamp() * 1000)

            end_offset = consumer.end_offsets([topic_partition])[topic_partition]
            if end_offset == 0:
                self.logger.warning("topic:{} end offset:{}".format(topic, end_offset))
                self.logger.error("the topic:{} has no data,but you want to backtest".format(self.quote_topic))
                return

            # find the offset from start_timestamp
            offset_and_timestamp = consumer.offsets_for_times({topic_partition: start_timestamp})

            if offset_and_timestamp:
                offset_and_timestamp = offset_and_timestamp[topic_partition]

                if offset_and_timestamp:
                    # the partition is assigned after poll, so we can seek
                    consumer.poll(5, 1)
                    # move to the offset
                    consumer.seek(topic_partition, offset_and_timestamp.offset)

                    for message in consumer:
                        if 'timestamp' in message.value:
                            message_time = to_timestamp(message.value['timestamp'])
                        else:
                            message_time = to_timestamp(message.timestamp)

                        if self.end_timestamp and (message_time > self.end_timestamp):
                            consumer.close()
                            break

                        getattr(self, func)(message.value)

                else:
                    latest_timestamp, _ = get_latest_timestamp_order_from_topic(self.quote_topic)
                    self.logger.warning(
                        "start:{} is after the last record:{}".format(self.start_timestamp, latest_timestamp))