Python pysam.FastxFile() Examples

The following are 4 code examples of pysam.FastxFile(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pysam, or try the search function.
Example #1
Source File: eval_full.py    From qcat with Mozilla Public License 2.0 6 votes vote down vote up
def create_reference(mapping, reference_file, fasta_path):
    """Write a combined FASTA reference for all mapped input files.

    :param mapping: dict keyed by input-file basename (without extension);
        values are lists whose first element is the reference name to emit.
    :param reference_file: path of the FASTA file to write.
    :param fasta_path: glob pattern matching the input fasta/q files.
    :returns: number of records written.
    :raises RuntimeError: if the record count does not equal the number of
        mapping entries (i.e. some mapped file yielded no/extra records).
    """
    ref_count = 0
    with open(reference_file, "w") as ref:
        for g in glob.glob(fasta_path):
            # Renamed from 'id' (shadowed the builtin). The mapping lookup
            # depends only on the file name, so test it once per file and
            # skip unmapped files entirely instead of parsing them.
            ref_id = os.path.splitext(os.path.basename(g))[0]
            if ref_id not in mapping:
                continue
            with pysam.FastxFile(g) as fh:
                for entry in fh:
                    ref_count += 1
                    print(">" + str(mapping[ref_id][0]) + " " + entry.name,
                          entry.comment, file=ref)
                    print(entry.sequence, file=ref)
    # NOTE(review): ref_count counts records, but it is compared against the
    # number of mapping keys — this assumes exactly one record per mapped
    # file. Preserved as-is from the original.
    if ref_count != len(mapping.keys()):
        raise RuntimeError(
            "Couldn't find all references, please check mappings!")
    return ref_count
Example #2
Source File: util.py    From pomoxis with Mozilla Public License 2.0 5 votes vote down vote up
def split_fastx(fname, output, chunksize=10000):
    """Split records in a fasta/q into fixed lengths.

    :param fname: input filename.
    :param output: output filename.
    :param chunksize: (maximum) length of output records.
    """
    with open(output, 'w') as dst, \
            pysam.FastxFile(fname, persist=False) as src:
        for record in src:
            # Preserve any existing comment and append the chunk length.
            comment = (
                'chunk_length={}'.format(chunksize)
                if record.comment is None
                else '{} chunk_length={}'.format(record.comment, chunksize))
            if record.quality is None:
                # fasta: one '>' record per sequence chunk.
                for idx, piece in enumerate(chunks(record.sequence, chunksize)):
                    chunk_name = '{}_chunk{}'.format(record.name, idx)
                    dst.write(">{} {}\n{}\n".format(
                        chunk_name, comment, ''.join(piece)))
            else:
                # fastq: walk sequence and quality chunks in lockstep.
                paired = zip(chunks(record.sequence, chunksize),
                             chunks(record.quality, chunksize))
                for idx, (piece, qual) in enumerate(paired):
                    chunk_name = '{}_chunk{}'.format(record.name, idx)
                    dst.write('@{} {}\n{}\n+\n{}\n'.format(
                        chunk_name, comment, ''.join(piece), ''.join(qual)))
Example #3
Source File: util.py    From pomoxis with Mozilla Public License 2.0 5 votes vote down vote up
def get_seq_lens(fastx):
    """Return sequence lengths from a fasta/q file.

    :param fastx: input fasta/q filename.
    :returns: list of int sequence lengths, in file order.
    """
    # Use a context manager so the underlying file handle is closed
    # deterministically (the original left closing to garbage collection),
    # matching the style of the other FastxFile uses in this file.
    with pysam.FastxFile(fastx) as fh:
        return [len(rec.sequence) for rec in fh]
Example #4
Source File: smolecule.py    From medaka with Mozilla Public License 2.0 4 votes vote down vote up
def multi_from_fastx(cls, fastx,
                         take_all=False, read_id=None, depth_filter=1,
                         length_filter=0):
        """Create multiple `Read` s from a fasta/q file.

        It is assumed that subreads are grouped by read and named with
        <read_id>_<subread_id>.

        :param fastx: input file path.
        :param take_all: skip check on subread_ids, take all subreads in one
            `Read`.
        :param read_id: name of `Read`. Only used for `take_all == True`. If
            not given the basename of the input file is used.
        :param depth_filter: require reads to have at least this many subreads.
        :param length_filter: require reads to have a median subread length
            above this value.
        :yields: instances of `cls`, one per group passing both filters.

        """
        # A group always needs at least one subread.
        depth_filter = max(1, depth_filter)
        if take_all and read_id is None:
            read_id = os.path.splitext(os.path.basename(fastx))[0]
        else:
            # NOTE(review): when take_all is False, any caller-supplied
            # read_id is discarded here; ids come from entry names below.
            read_id = None
        subreads = []
        with pysam.FastxFile(fastx) as fh:
            for entry in fh:
                if not take_all:
                    # Entries are named <read_id>_<subread_id>; a change of
                    # prefix marks the boundary of a new read group.
                    cur_read_id = entry.name.split("_")[0]
                    if cur_read_id != read_id:
                        # Flush the completed group if it passes both the
                        # depth and median-length filters.
                        if len(subreads) >= depth_filter:
                            med_length = np.median(
                                [len(x.seq) for x in subreads])
                            if med_length > length_filter:
                                yield cls(read_id, subreads)
                        read_id = cur_read_id
                        subreads = []
                # Zero-length records are silently dropped.
                if len(entry.sequence) > 0:
                    subreads.append(Subread(entry.name, entry.sequence))

            # Flush the final group (or the single group when take_all).
            if len(subreads) >= depth_filter:
                med_length = np.median([len(x.seq) for x in subreads])
                if med_length > length_filter:
                    yield cls(read_id, subreads)