diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 99ea1576..ab16c602 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -3,31 +3,59 @@ from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup -from libc.stdio cimport FILE, printf from posix.types cimport off_t -cdef extern from "Python.h": - FILE* PyFile_AsFile(object) - # cython does not wrap stdarg -cdef extern from "stdarg.h": +cdef extern from "": + ctypedef struct va_list: pass - + +# See the corresponding htslib/*.h headers for documentation of these +# HTSlib API functions. + + cdef extern from "htslib/kstring.h" nogil: + ctypedef struct kstring_t: size_t l, m char *s + int kvsprintf(kstring_t *s, const char *fmt, va_list ap) + int ksprintf(kstring_t *s, const char *fmt, ...) + int kputd(double d, kstring_t *s) + + ctypedef char *kgets_func(char *, int, void *) + int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) + + ctypedef ssize_t kgets_func2(char *, size_t, void *) + int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp) + + ks_initialize(kstring_t *s) + int ks_resize(kstring_t *s, size_t size) + int ks_expand(kstring_t *s, size_t expansion) + char *ks_str(kstring_t *s) + const char *ks_c_str(kstring_t *s) + size_t ks_len(kstring_t *s) + kstring_t *ks_clear(kstring_t *s) + char *ks_release(kstring_t *s) + void ks_free(kstring_t *s) + + int kputsn(const char *p, size_t l, kstring_t *s) + int kputs(const char *p, kstring_t *s) int kputc(int c, kstring_t *s) + int kputc_(int c, kstring_t *s) + int kputsn_(const void *p, size_t l, kstring_t *s) + int kputuw(unsigned x, kstring_t *s) int kputw(int c, kstring_t *s) + int kputll(long long c, kstring_t *s) int kputl(long c, kstring_t *s) - int ksprintf(kstring_t *s, const char *fmt, ...) cdef extern from "htslib_util.h" nogil: + int hts_set_verbosity(int verbosity) int hts_get_verbosity() @@ -62,134 +90,52 @@ cdef extern from "htslib_util.h" nogil: cdef extern from "htslib/hfile.h" nogil: + ctypedef struct hFILE - # @abstract Open the named file or URL as a stream - # @return An hFILE pointer, or NULL (with errno set) if an error occurred. hFILE *hopen(const char *filename, const char *mode, ...) - - # @abstract Associate a stream with an existing open file descriptor - # @return An hFILE pointer, or NULL (with errno set) if an error occurred. - # @notes For socket descriptors (on Windows), mode should contain 's'. hFILE *hdopen(int fd, const char *mode) - - # @abstract Report whether the file name or URL denotes remote storage - # @return 0 if local, 1 if remote. - # @notes "Remote" means involving e.g. explicit network access, with the - # implication that callers may wish to cache such files' contents locally. int hisremote(const char *filename) - - # @abstract Flush (for output streams) and close the stream - # @return 0 if successful, or EOF (with errno set) if an error occurred. + char *haddextension(kstring_t *buffer, const char *filename, int replace, const char *extension) int hclose(hFILE *fp) - - # @abstract Close the stream, without flushing or propagating errors - # @notes For use while cleaning up after an error only. Preserves errno. void hclose_abruptly(hFILE *fp) - # @abstract Return the stream's error indicator - # @return Non-zero (in fact, an errno value) if an error has occurred. - # @notes This would be called herror() and return true/false to parallel - # ferror(3), but a networking-related herror(3) function already exists. */ int herrno(hFILE *fp) - - # @abstract Clear the stream's error indicator void hclearerr(hFILE *fp) - # @abstract Reposition the read/write stream offset - # @return The resulting offset within the stream (as per lseek(2)), - # or negative if an error occurred. off_t hseek(hFILE *fp, off_t offset, int whence) - - # @abstract Report the current stream offset - # @return The offset within the stream, starting from zero. off_t htell(hFILE *fp) - - # @abstract Read one character from the stream - # @return The character read, or EOF on end-of-file or error int hgetc(hFILE *fp) - - # Read from the stream until the delimiter, up to a maximum length - # @param buffer The buffer into which bytes will be written - # @param size The size of the buffer - # @param delim The delimiter (interpreted as an `unsigned char`) - # @param fp The file stream - # @return The number of bytes read, or negative on error. - # @since 1.4 - # - # Bytes will be read into the buffer up to and including a delimiter, until - # EOF is reached, or _size-1_ bytes have been written, whichever comes first. - # The string will then be terminated with a NUL byte (`\0`). ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) - - # Read a line from the stream, up to a maximum length - # @param buffer The buffer into which bytes will be written - # @param size The size of the buffer - # @param fp The file stream - # @return The number of bytes read, or negative on error. - # @since 1.4 - # - # Specialization of hgetdelim() for a `\n` delimiter. ssize_t hgetln(char *buffer, size_t size, hFILE *fp) - - # Read a line from the stream, up to a maximum length - # @param buffer The buffer into which bytes will be written - # @param size The size of the buffer (must be > 1 to be useful) - # @param fp The file stream - # @return _buffer_ on success, or `NULL` if an error occurred. - # @since 1.4 - # - # This function can be used as a replacement for `fgets(3)`, or together with - # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. char *hgets(char *buffer, int size, hFILE *fp) - - # @abstract Peek at characters to be read without removing them from buffers - # @param fp The file stream - # @param buffer The buffer to which the peeked bytes will be written - # @param nbytes The number of bytes to peek at; limited by the size of the - # internal buffer, which could be as small as 4K. - # @return The number of bytes peeked, which may be less than nbytes if EOF - # is encountered; or negative, if there was an I/O error. - # @notes The characters peeked at remain in the stream's internal buffer, - # and will be returned by later hread() etc calls. ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) - - # @abstract Read a block of characters from the file - # @return The number of bytes read, or negative if an error occurred. - # @notes The full nbytes requested will be returned, except as limited - # by EOF or I/O errors. ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) - - # @abstract Write a character to the stream - # @return The character written, or EOF if an error occurred. int hputc(int c, hFILE *fp) - - # @abstract Write a string to the stream - # @return 0 if successful, or EOF if an error occurred. int hputs(const char *text, hFILE *fp) - - # @abstract Write a block of characters to the file - # @return Either nbytes, or negative if an error occurred. - # @notes In the absence of I/O errors, the full nbytes will be written. ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) - - # @abstract For writing streams, flush buffered output to the underlying stream - # @return 0 if successful, or EOF if an error occurred. int hflush(hFILE *fp) + char *hfile_mem_get_buffer(hFILE *file, size_t *length) + char *hfile_mem_steal_buffer(hFILE *file, size_t *length) + int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) + int hfile_list_plugins(const char *plist[], int *nplugins) + int hfile_has_plugin(const char *name) cdef extern from "htslib/bgzf.h" nogil: + ctypedef struct bgzf_mtaux_t ctypedef struct bgzidx_t - ctypedef struct z_stream - - ctypedef struct BGZF: - unsigned errcode - unsigned is_write - int is_be - int compress_level - int is_compressed - int is_gzip + ctypedef struct hts_tpool + ctypedef struct z_stream_s + + ctypedef struct BGZF: # struct will be made opaque in future + unsigned errcode + unsigned is_write + unsigned is_be + signed compress_level + unsigned is_compressed + unsigned is_gzip int cache_size int64_t block_address int64_t uncompressed_address @@ -200,256 +146,88 @@ cdef extern from "htslib/bgzf.h" nogil: bgzf_mtaux_t *mt bgzidx_t *idx int idx_build_otf - z_stream *gz_stream - - #***************** - # Basic routines * - # *****************/ - - # Open an existing file descriptor for reading or writing. - # - # @param fd file descriptor - # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for - # writing, 'a' for appending, 'g' for gzip rather than BGZF - # compression (with 'w' only), and digit specifies the zlib - # compression level. - # Note that there is a distinction between 'u' and '0': the - # first yields plain uncompressed output whereas the latter - # outputs uncompressed data wrapped in the zlib format. - # @return BGZF file handler; 0 on error - - BGZF* bgzf_dopen(int fd, const char *mode) - BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility - - # Open the specified file for reading or writing. - BGZF* bgzf_open(const char* path, const char *mode) - - # Open an existing hFILE stream for reading or writing. - BGZF* bgzf_hopen(hFILE *fp, const char *mode) - - # Close the BGZF and free all associated resources. - # - # @param fp BGZF file handler - # @return 0 on success and -1 on error + z_stream_s *gz_stream + + BGZF *bgzf_dopen(int fd, const char *mode) + BGZF *bgzf_fdopen(int fd, const char *mode) # for backward compatibility + BGZF *bgzf_open(const char *path, const char *mode) + BGZF *bgzf_hopen(hFILE *fp, const char *mode) int bgzf_close(BGZF *fp) - # Read up to _length_ bytes from the file storing into _data_. - # - # @param fp BGZF file handler - # @param data data array to read into - # @param length size of data to read - # @return number of bytes actually read; 0 on end-of-file and -1 on error ssize_t bgzf_read(BGZF *fp, void *data, size_t length) - - # Write _length_ bytes from _data_ to the file. If no I/O errors occur, - # the complete _length_ bytes will be written (or queued for writing). - # - # @param fp BGZF file handler - # @param data data array to write - # @param length size of data to write - # @return number of bytes written (i.e., _length_); negative on error + ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) - - # Read up to _length_ bytes directly from the underlying stream without - # decompressing. Bypasses BGZF blocking, so must be used with care in - # specialised circumstances only. - # - # @param fp BGZF file handler - # @param data data array to read into - # @param length number of raw bytes to read - # @return number of bytes actually read; 0 on end-of-file and -1 on error + ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) + ssize_t bgzf_block_write(BGZF *fp, const void *data, size_t length) + int bgzf_peek(BGZF *fp) ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) - - # Write _length_ bytes directly to the underlying stream without - # compressing. Bypasses BGZF blocking, so must be used with care - # in specialised circumstances only. - # - # @param fp BGZF file handler - # @param data data array to write - # @param length number of raw bytes to write - # @return number of bytes actually written; -1 on error ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) - - # Write the data in the buffer to the file. int bgzf_flush(BGZF *fp) - - # Return a virtual file pointer to the current location in the file. - # No interpretation of the value should be made, other than a subsequent - # call to bgzf_seek can be used to position the file at the same point. - # Return value is non-negative on success. - int64_t bgzf_tell(BGZF *fp) - - # Set the file to read from the location specified by _pos_. - # - # @param fp BGZF file handler - # @param pos virtual file offset returned by bgzf_tell() - # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd) - # @return 0 on success and -1 on error - # / + int64_t bgzf_tell(const BGZF *fp) int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) - # Check if the BGZF end-of-file (EOF) marker is present - # - # @param fp BGZF file handler opened for reading - # @return 1 if the EOF marker is present and correct - # 2 if it can't be checked, e.g., because fp isn't seekable - # 0 if the EOF marker is absent - # -1 (with errno set) on error int bgzf_check_EOF(BGZF *fp) + int bgzf_compression(BGZF *fp) + int bgzf_is_bgzf(const char *fn) # deprecated - # Check if a file is in the BGZF format - # - # @param fn file name - # @return 1 if _fn_ is BGZF; 0 if not or on I/O error - int bgzf_is_bgzf(const char *fn) - - #********************* - # Advanced routines * - #********************* - - # Set the cache size. Only effective when compiled with -DBGZF_CACHE. - # - # @param fp BGZF file handler - # @param size size of cache in bytes; 0 to disable caching (default) void bgzf_set_cache_size(BGZF *fp, int size) - - # Flush the file if the remaining buffer size is smaller than _size_ - # @return 0 if flushing succeeded or was not needed; negative on error int bgzf_flush_try(BGZF *fp, ssize_t size) - - # Read one byte from a BGZF file. It is faster than bgzf_read() - # @param fp BGZF file handler - # @return byte read; -1 on end-of-file or error int bgzf_getc(BGZF *fp) - - # Read one line from a BGZF file. It is faster than bgzf_getc() - # - # @param fp BGZF file handler - # @param delim delimiter - # @param str string to write to; must be initialized - # @return length of the string; 0 on end-of-file; negative on error int bgzf_getline(BGZF *fp, int delim, kstring_t *str) - - # Read the next BGZF block. int bgzf_read_block(BGZF *fp) - - # Enable multi-threading (only effective on writing and when the - # library was compiled with -DBGZF_MT) - # - # @param fp BGZF file handler; must be opened for writing - # @param n_threads #threads used for writing - # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + int bgzf_thread_pool(BGZF *fp, hts_tpool *pool, int qsize) int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) - - - # Compress a single BGZF block. - # - # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE) - # @param dlen size of output buffer; updated on return to the number - # of bytes actually written to dst - # @param src buffer to be compressed - # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE) - # @param level compression level - # @return 0 on success and negative on error - # int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level) - #******************* - # bgzidx routines * - # BGZF at the uncompressed offset - # - # @param fp BGZF file handler; must be opened for reading - # @param uoffset file offset in the uncompressed data - # @param where SEEK_SET (cimported from libc.stdio) supported atm - # - # Returns 0 on success and -1 on error. - int bgzf_useek(BGZF *fp, long uoffset, int where) - - # Position in uncompressed BGZF - # - # @param fp BGZF file handler; must be opened for reading - # - # Returns the current offset on success and -1 on error. - long bgzf_utell(BGZF *fp) - - # Tell BGZF to build index while compressing. - # - # @param fp BGZF file handler; can be opened for reading or writing. - # - # Returns 0 on success and -1 on error. - int bgzf_index_build_init(BGZF *fp) + int bgzf_useek(BGZF *fp, off_t uoffset, int where) + off_t bgzf_utell(BGZF *fp) - # Load BGZF index - # - # @param fp BGZF file handler - # @param bname base name - # @param suffix suffix to add to bname (can be NULL) - # - # Returns 0 on success and -1 on error. + int bgzf_index_build_init(BGZF *fp) int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) - - # Save BGZF index - # - # @param fp BGZF file handler - # @param bname base name - # @param suffix suffix to add to bname (can be NULL) - # - # Returns 0 on success and -1 on error. + int bgzf_index_load_hfile(BGZF *fp, hFILE *idx, const char *name) int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) + int bgzf_index_dump_hfile(BGZF *fp, hFILE *idx, const char *name) cdef extern from "htslib/hts.h" nogil: + + # Actually provided by htslib/hts_log.h since HTSlib 1.5, but #included from htslib/hts.h too + int hts_verbose + + # Actually provided by htslib/kroundup.h since HTSlib 1.11, but #included from htslib/hts.h too + uint64_t kroundup64(uint64_t x) uint32_t kroundup32(uint32_t x) + size_t kroundup_size_t(size_t x) + + char HTS_PATH_SEPARATOR_CHAR + char HTS_PATH_SEPARATOR_STR ctypedef struct cram_fd - union FilePointerUnion: - BGZF *bgzf - cram_fd *cram - hFILE *hfile - void *voidp + void hts_lib_shutdown() + void hts_free(void *ptr) - enum htsFormatCategory: + cdef enum htsFormatCategory: unknown_category - sequence_data # Sequence data -- SAM, BAM, CRAM, etc - variant_data # Variant calling data -- VCF, BCF, etc - index_file # Index file associated with some data file - region_list # Coordinate intervals or regions -- BED, etc - category_maximum + sequence_data + variant_data + index_file + region_list - enum htsExactFormat: + cdef enum htsExactFormat: unknown_format - binary_format - text_format + binary_format, text_format sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed - format_maximum - - enum htsCompression: - no_compression, gzip, bgzf, custom - compression_maximum + htsget + # json # replaced by `htsget`; conflicts with `import json` in Python + empty_format + fasta_format, fastq_format, fai_format, fqi_format + hts_crypt4gh_format + d4_format - cdef enum hts_fmt_option: - CRAM_OPT_DECODE_MD, - CRAM_OPT_PREFIX, - CRAM_OPT_VERBOSITY, - CRAM_OPT_SEQS_PER_SLICE, - CRAM_OPT_SLICES_PER_CONTAINER, - CRAM_OPT_RANGE, - CRAM_OPT_VERSION, - CRAM_OPT_EMBED_REF, - CRAM_OPT_IGNORE_MD5, - CRAM_OPT_REFERENCE, - CRAM_OPT_MULTI_SEQ_PER_SLICE, - CRAM_OPT_NO_REF, - CRAM_OPT_USE_BZIP2, - CRAM_OPT_SHARED_REF, - CRAM_OPT_NTHREADS, - CRAM_OPT_THREAD_POOL, - CRAM_OPT_USE_LZMA, - CRAM_OPT_USE_RANS, - CRAM_OPT_REQUIRED_FIELDS, - HTS_OPT_COMPRESSION_LEVEL, - HTS_OPT_NTHREADS, + cdef enum htsCompression: + no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression + xz_compression, zstd_compression ctypedef struct htsVersion: short major, minor @@ -460,21 +238,91 @@ cdef extern from "htslib/hts.h" nogil: htsVersion version htsCompression compression short compression_level - void *specific + void *specific - ctypedef struct htsFile: - uint8_t is_bin - uint8_t is_write - uint8_t is_be - uint8_t is_cram + ctypedef struct hts_idx_t + + cdef union htsFile_fp_union: + BGZF *bgzf + cram_fd *cram + hFILE *hfile + + ctypedef struct htsFile: # struct will be made opaque in future + uint8_t is_bin, is_write, is_be, is_cram, is_bgzf int64_t lineno kstring_t line char *fn char *fn_aux - FilePointerUnion fp + htsFile_fp_union fp htsFormat format - int hts_verbose + ctypedef struct htsThreadPool: + hts_tpool *pool + int qsize + + int SAM_QNAME + int SAM_FLAG + int SAM_RNAME + int SAM_POS + int SAM_MAPQ + int SAM_CIGAR + int SAM_RNEXT + int SAM_PNEXT + int SAM_TLEN + int SAM_SEQ + int SAM_QUAL + int SAM_AUX + int SAM_RGAUX + + cdef enum hts_fmt_option: + CRAM_OPT_DECODE_MD + CRAM_OPT_PREFIX + CRAM_OPT_VERBOSITY + CRAM_OPT_SEQS_PER_SLICE + CRAM_OPT_SLICES_PER_CONTAINER + CRAM_OPT_RANGE + CRAM_OPT_VERSION + CRAM_OPT_EMBED_REF + CRAM_OPT_IGNORE_MD5 + CRAM_OPT_REFERENCE + CRAM_OPT_MULTI_SEQ_PER_SLICE + CRAM_OPT_NO_REF + CRAM_OPT_USE_BZIP2 + CRAM_OPT_SHARED_REF + CRAM_OPT_NTHREADS + CRAM_OPT_THREAD_POOL + CRAM_OPT_USE_LZMA + CRAM_OPT_USE_RANS + CRAM_OPT_REQUIRED_FIELDS + CRAM_OPT_LOSSY_NAMES + CRAM_OPT_BASES_PER_SLICE + CRAM_OPT_STORE_MD + CRAM_OPT_STORE_NM + CRAM_OPT_RANGE_NOSEEK + CRAM_OPT_USE_TOK + CRAM_OPT_USE_FQZ + CRAM_OPT_USE_ARITH + CRAM_OPT_POS_DELTA + + HTS_OPT_COMPRESSION_LEVEL + HTS_OPT_NTHREADS + HTS_OPT_THREAD_POOL + HTS_OPT_CACHE_SIZE + HTS_OPT_BLOCK_SIZE + HTS_OPT_FILTER + HTS_OPT_PROFILE + + FASTQ_OPT_CASAVA + FASTQ_OPT_AUX + FASTQ_OPT_RNUM + FASTQ_OPT_BARCODE + FASTQ_OPT_NAME2 + + cdef enum hts_profile_option: + HTS_PROFILE_FAST + HTS_PROFILE_NORMAL + HTS_PROFILE_SMALL + HTS_PROFILE_ARCHIVE cdef union hts_opt_val_union: int i @@ -484,147 +332,62 @@ cdef extern from "htslib/hts.h" nogil: char *arg hts_fmt_option opt hts_opt_val_union val - void *next + hts_opt *next - # @abstract Parses arg and appends it to the option list. - # @return 0 on success and -1 on failure int hts_opt_add(hts_opt **opts, const char *c_arg) - - # @abstract Applies an hts_opt option list to a given htsFile. - # @return 0 on success and -1 on failure int hts_opt_apply(htsFile *fp, hts_opt *opts) - - # @abstract Frees an hts_opt list. void hts_opt_free(hts_opt *opts) + int hts_parse_format(htsFormat *opt, const char *str) + int hts_parse_opt_list(htsFormat *opt, const char *str) - # @abstract Table for converting a nucleotide character to 4-bit encoding. - # The input character may be either an IUPAC ambiguity code, '=' for 0, or - # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 - # for A/C/G/T or combinations of these bits for ambiguous bases. - const unsigned char *seq_nt16_table + const unsigned char seq_nt16_table[] + const char seq_nt16_str[] + const int seq_nt16_int[] - # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC - # ambiguity code letter (or '=' when given 0). - const char *seq_nt16_str - - # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. - # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). - const int *seq_nt16_int - - # @abstract Get the htslib version number - # @return For released versions, a string like "N.N[.N]"; or git describe - # output if using a library built within a Git repository. const char *hts_version() + int HTS_VERSION + unsigned int hts_features() + const char *hts_test_feature(unsigned int id) + const char *hts_feature_string() + + int HTS_FEATURE_CONFIGURE + int HTS_FEATURE_PLUGINS + int HTS_FEATURE_LIBCURL + int HTS_FEATURE_S3 + int HTS_FEATURE_GCS + int HTS_FEATURE_LIBDEFLATE + int HTS_FEATURE_LZMA + int HTS_FEATURE_BZIP2 + int HTS_FEATURE_HTSCODECS + int HTS_FEATURE_CC + int HTS_FEATURE_CFLAGS + int HTS_FEATURE_CPPFLAGS + int HTS_FEATURE_LDFLAGS - # @abstract Determine format by peeking at the start of a file - # @param fp File opened for reading, positioned at the beginning - # @param fmt Format structure that will be filled out on return - # @return 0 for success, or negative if an error occurred. int hts_detect_format(hFILE *fp, htsFormat *fmt) - - # @abstract Get a human-readable description of the file format - # @return Description string, to be freed by the caller after use. + int hts_detect_format2(hFILE *fp, const char *fname, htsFormat *fmt) char *hts_format_description(const htsFormat *format) - # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file - # @param fn The file name or "-" for stdin/stdout - # @param mode Mode matching / [rwa][bceguxz0-9]* / - # @discussion - # With 'r' opens for reading; any further format mode letters are ignored - # as the format is detected by checking the first few bytes or BGZF blocks - # of the file. With 'w' or 'a' opens for writing or appending, with format - # specifier letters: - # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) - # c CRAM format - # g gzip compressed - # u uncompressed - # z bgzf compressed - # [0-9] zlib compression level - # and with non-format option letters (for any of 'r'/'w'/'a'): - # e close the file on exec(2) (opens with O_CLOEXEC, where supported) - # x create the file exclusively (opens with O_EXCL, where supported) - # Note that there is a distinction between 'u' and '0': the first yields - # plain uncompressed output whereas the latter outputs uncompressed data - # wrapped in the zlib format. - # @example - # [rw]b .. compressed BCF, BAM, FAI - # [rw]bu .. uncompressed BCF - # [rw]z .. compressed VCF - # [rw] .. uncompressed VCF htsFile *hts_open(const char *fn, const char *mode) - - # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file - # @param fn The file name or "-" for stdin/stdout - # @param mode Open mode, as per hts_open() - # @param fmt Optional format specific parameters - # @discussion - # See hts_open() for description of fn and mode. - # // TODO Update documentation for s/opts/fmt/ - # Opts contains a format string (sam, bam, cram, vcf, bcf) which will, - # if defined, override mode. Opts also contains a linked list of hts_opt - # structures to apply to the open file handle. These can contain things - # like pointers to the reference or information on compression levels, - # block sizes, etc. htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) - - # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file - # @param fp The already-open file handle - # @param fn The file name or "-" for stdin/stdout - # @param mode Open mode, as per hts_open() htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) - - # @abstract For output streams, flush any buffered data - # @param fp The file handle to be flushed - # @return 0 for success, or negative if an error occurred. - # @since 1.14 int hts_flush(htsFile *fp) - - # @abstract Close a file handle, flushing buffered data for output streams - # @param fp The file handle to be closed - # @return 0 for success, or negative if an error occurred. int hts_close(htsFile *fp) - # @abstract Returns the file's format information - # @param fp The file handle - # @return Read-only pointer to the file's htsFormat. const htsFormat *hts_get_format(htsFile *fp) - - # @ abstract Returns a string containing the file format extension. - # @ param format Format structure containing the file type. - # @ return A string ("sam", "bam", etc) or "?" for unknown formats. const char *hts_format_file_extension(const htsFormat *format) - - # @abstract Sets a specified CRAM option on the open file handle. - # @param fp The file handle open the open file. - # @param opt The CRAM_OPT_* option. - # @param ... Optional arguments, dependent on the option used. - # @return 0 for success, or negative if an error occurred. int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) int hts_getline(htsFile *fp, int delimiter, kstring_t *str) char **hts_readlines(const char *fn, int *_n) - - # @abstract Parse comma-separated list or read list from a file - # @param list File name or comma-separated list - # @param is_file - # @param _n Size of the output array (number of items read) - # @return NULL on failure or pointer to newly allocated array of - # strings char **hts_readlist(const char *fn, int is_file, int *_n) - # @abstract Create extra threads to aid compress/decompression for this file - # @param fp The file handle - # @param n The number of worker threads to create - # @return 0 for success, or negative if an error occurred. - # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. int hts_set_threads(htsFile *fp, int n) - - # @abstract Set .fai filename for a file opened for reading - # @return 0 for success, negative on failure - # @discussion - # Called before *_hdr_read(), this provides the name of a .fai file - # used to provide a reference list if the htsFile contains no @SQ headers. + int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) + void hts_set_cache_size(htsFile *fp, int n) int hts_set_fai_filename(htsFile *fp, const char *fn_aux) + int hts_set_filter_expression(htsFile *fp, const char *expr) + int hts_check_EOF(htsFile *fp) int8_t HTS_IDX_NOCOOR int8_t HTS_IDX_START @@ -635,135 +398,115 @@ cdef extern from "htslib/hts.h" nogil: int8_t HTS_FMT_BAI int8_t HTS_FMT_TBI int8_t HTS_FMT_CRAI + int8_t HTS_FMT_FAI - BGZF *hts_get_bgzfp(htsFile *fp) + int64_t HTS_POS_MAX + int64_t HTS_POS_MIN + const char PRIhts_pos[] + ctypedef int64_t hts_pos_t - ctypedef struct hts_idx_t + ctypedef struct hts_pair_pos_t: + hts_pos_t beg, end + + ctypedef hts_pair_pos_t hts_pair32_t # deprecated ctypedef struct hts_pair64_t: uint64_t u, v - ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end) + ctypedef struct hts_pair64_max_t: + uint64_t u, v + uint64_t max + + ctypedef struct hts_reglist_t: # struct will be made opaque in future + const char *reg + hts_pair_pos_t *intervals + int tid + uint32_t count + hts_pos_t min_beg, max_end - ctypedef struct hts_bins_t: + ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end) + ctypedef int hts_seek_func(void *fp, int64_t offset, int where) + ctypedef int64_t hts_tell_func(void *fp) + + ctypedef struct hts_itr_bins_struct: int n, m int *a - ctypedef struct hts_itr_t: - uint32_t read_rest - uint32_t finished - int tid, bed, end, n_off, i - int curr_tid, curr_beg, curr_end + ctypedef struct hts_itr_t: # struct will be made opaque in future + uint8_t read_rest, finished + int tid, n_off, i + hts_pos_t beg, end + int curr_tid + hts_pos_t curr_beg, curr_end uint64_t curr_off - hts_pair64_t *off - hts_readrec_func *readfunc - hts_bins_t bins + hts_pair64_max_t *off + hts_readrec_func *readrec + hts_itr_bins_struct bins + + ctypedef hts_itr_t hts_itr_multi_t + + int hts_bin_first(int l) + int hts_bin_parent(int b) hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) void hts_idx_destroy(hts_idx_t *idx) - int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) - void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) - - #### Save an index to a file - # @param idx Index to be written - # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added - # @param fmt One of the HTS_FMT_* index formats - # @return 0 if successful, or negative if an error occurred. + int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) + int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) + int hts_idx_fmt(hts_idx_t *idx) + int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) - - #### Save an index to a specific file - # @param idx Index to be written - # @param fn Input BAM/BCF/etc filename - # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn - # @param fmt One of the HTS_FMT_* index formats - # @return 0 if successful, or negative if an error occurred. int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) - - #### Load an index file - # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or - # the extension substituted, to search for an existing index file - # @param fmt One of the HTS_FMT_* index formats - # @return The index, or NULL if an error occurred. hts_idx_t *hts_idx_load(const char *fn, int fmt) - - #### Load a specific index file - # @param fn Input BAM/BCF/etc filename - # @param fnidx The input index filename - # @return The index, or NULL if an error occurred. hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) - - #### Load a specific index file - # @param fn Input BAM/BCF/etc filename - # @param fnidx The input index filename - # @param fmt One of the HTS_FMT_* index formats - # @param flags Flags to alter behaviour (see description) - # @return The index, or NULL if an error occurred. hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) int HTS_IDX_SAVE_REMOTE int HTS_IDX_SILENT_FAIL + ctypedef const char *(*hts_id2name_f)(void *, int) + uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) - void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) - - int hts_idx_get_stat(const hts_idx_t* idx, int tid, - uint64_t* mapped, uint64_t* unmapped) - - uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) - - int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers - - # Parse a numeric string - # The number may be expressed in scientific notation, and optionally may - # contain commas in the integer part (before any decimal point or E notation). - # @param str String to be parsed - # @param strend If non-NULL, set on return to point to the first character - # in @a str after those forming the parsed number - # @param flags Or'ed-together combination of HTS_PARSE_* flags - # @return Converted value of the parsed number. - # - # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 - # or more) if there are any trailing characters after the number. + int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy) + + int hts_idx_get_stat(const hts_idx_t *idx, int tid, uint64_t *mapped, uint64_t *unmapped) + uint64_t hts_idx_get_n_no_coor(const hts_idx_t *idx) + const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) + int hts_idx_nseq(const hts_idx_t *idx) + + int HTS_PARSE_THOUSANDS_SEP + int HTS_PARSE_ONE_COORD + int HTS_PARSE_LIST + long long hts_parse_decimal(const char *str, char **strend, int flags) - # Parse a "CHR:START-END"-style region string - # @param str String to be parsed - # @param beg Set on return to the 0-based start of the region - # @param end Set on return to the 1-based end of the region - # @return Pointer to the colon or '\0' after the reference sequence name, - # or NULL if @a str could not be parsed. + ctypedef int (*hts_name2id_f)(void *, const char *) + + const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end) const char *hts_parse_reg(const char *str, int *beg, int *end) + const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, hts_name2id_f getid, void *hdr, int flags) - hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) + hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) void hts_itr_destroy(hts_itr_t *iter) - ctypedef int (*hts_name2id_f)(void*, const char*) - ctypedef const char *(*hts_id2name_f)(void*, int) - ctypedef hts_itr_t *hts_itr_query_func( - const hts_idx_t *idx, - int tid, - int beg, - int end, - hts_readrec_func *readrec) - - hts_itr_t *hts_itr_querys( - const hts_idx_t *idx, - const char *reg, - hts_name2id_f getid, - void *hdr, - hts_itr_query_func *itr_query, - hts_readrec_func *readrec) + ctypedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) + + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) - const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values - - # hts_file_type() - Convenience function to determine file type - # @fname: the file name - # - # Returns one of the FT_* defines. - # - # DEPRECATED: This function has been replaced by hts_detect_format(). - # It and these FT_* macros will be removed in a future HTSlib release. + + ctypedef int hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_t *itr) + int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) + int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) + + hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) + + int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) + + hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr, hts_name2id_f getid) + void hts_reglist_free(hts_reglist_t *reglist, int count) + + void hts_itr_multi_destroy(hts_itr_t *iter) + int FT_UNKN int FT_GZ int FT_VCF @@ -772,28 +515,13 @@ cdef extern from "htslib/hts.h" nogil: int FT_BCF_GZ int FT_STDIN - int hts_file_type(const char *fname) - - # /*************************** - # * Revised MAQ error model * - # ***************************/ + int hts_file_type(const char *fname) # deprecated (along with FT_*) ctypedef struct errmod_t errmod_t *errmod_init(double depcorr) void errmod_destroy(errmod_t *em) - - # /* - # n: number of bases - # m: maximum base - # bases[i]: qual:6, strand:1, base:4 - # q[i*m+j]: phred-scaled likelihood of (i,j) - # */ - int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic) - - # /***************************************** - # * q banded glocal alignment * - # *****************************************/ + int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) ctypedef struct probaln_par_t: float d, e @@ -806,51 +534,20 @@ cdef extern from "htslib/hts.h" nogil: const probaln_par_t *c, int *state, uint8_t *q) - # /********************** - # * MD5 implementation * - # **********************/ - ctypedef struct hts_md5_context - # /*! @abstract Initialises an MD5 context. - # * @discussion - # * The expected use is to allocate an hts_md5_context using - # * hts_md5_init(). This pointer is then passed into one or more calls - # * of hts_md5_update() to compute successive internal portions of the - # * MD5 sum, which can then be externalised as a full 16-byte MD5sum - # * calculation by calling hts_md5_final(). This can then be turned - # * into ASCII via hts_md5_hex(). - # * - # * To dealloate any resources created by hts_md5_init() call the - # * hts_md5_destroy() function. - # * - # * @return hts_md5_context pointer on success, NULL otherwise. - # */ hts_md5_context *hts_md5_init() - - # /*! @abstract Updates the context with the MD5 of the data. */ void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size) - - # /*! @abstract Computes the final 128-bit MD5 hash from the given context */ void hts_md5_final(unsigned char *digest, hts_md5_context *ctx) - - # /*! @abstract Resets an md5_context to the initial state, as returned - # * by hts_md5_init(). - # */ void hts_md5_reset(hts_md5_context *ctx) - - # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated - # * hex string. - # */ void hts_md5_hex(char *hex, const unsigned char *digest) - - # /*! @abstract Deallocates any memory allocated by hts_md5_init. */ void hts_md5_destroy(hts_md5_context *ctx) - int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) + int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) + int hts_bin_level(int bin) int hts_bin_bot(int bin, int n_lvls) + hts_pos_t hts_bin_maxpos(int min_shift, int n_lvls) - # * Endianness * int ed_is_big() uint16_t ed_swap_2(uint16_t v) void *ed_swap_2p(void *x) @@ -861,30 +558,17 @@ cdef extern from "htslib/hts.h" nogil: cdef extern from "htslib/sam.h" nogil: - #********************** - #*** SAM/BAM header *** - #********************** - - # @abstract Structure for the alignment header. - # @field n_targets number of reference sequences - # @field l_text length of the plain text in the header - # @field target_len lengths of the reference sequences - # @field target_name names of the reference sequences - # @field text plain text - # @field sdict header dictionary - - ctypedef struct bam_hdr_t: - int32_t n_targets, ignore_sam_err - uint32_t l_text - uint32_t *target_len - uint8_t *cigar_tab - char **target_name - char *text - void *sdict - - #**************************** - #*** CIGAR related macros *** - #**************************** + + ctypedef struct sam_hdr_t: # struct will be made opaque in future + int32_t n_targets, ignore_sam_err + size_t l_text + uint32_t *target_len + const int8_t *cigar_tab # deprecated (use bam_cigar_table[] instead) + char **target_name + char *text + void *sdict + + ctypedef sam_hdr_t bam_hdr_t int BAM_CMATCH int BAM_CINS @@ -897,454 +581,448 @@ cdef extern from "htslib/sam.h" nogil: int BAM_CDIFF int BAM_CBACK - char *BAM_CIGAR_STR + const char BAM_CIGAR_STR[] int BAM_CIGAR_SHIFT uint32_t BAM_CIGAR_MASK uint32_t BAM_CIGAR_TYPE - char bam_cigar_op(uint32_t c) + const int8_t bam_cigar_table[256] + + int bam_cigar_op(uint32_t c) uint32_t bam_cigar_oplen(uint32_t c) - char bam_cigar_opchr(uint32_t) - uint32_t bam_cigar_gen(char, uint32_t) - int bam_cigar_type(char o) + char bam_cigar_opchr(uint32_t c) + uint32_t bam_cigar_gen(uint32_t l, int o) + int bam_cigar_type(int o) - # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair int BAM_FPAIRED - # @abstract the read is mapped in a proper pair int BAM_FPROPER_PAIR - # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR int BAM_FUNMAP - # @abstract the mate is unmapped int BAM_FMUNMAP - # @abstract the read is mapped to the reverse strand int BAM_FREVERSE - # @abstract the mate is mapped to the reverse strand int BAM_FMREVERSE - # @abstract this is read1 int BAM_FREAD1 - # @abstract this is read2 int BAM_FREAD2 - # @abstract not primary alignment int BAM_FSECONDARY - # @abstract QC failure int BAM_FQCFAIL - # @abstract optical or PCR duplicate int BAM_FDUP - # @abstract supplementary alignment int BAM_FSUPPLEMENTARY - #************************* - #*** Alignment records *** - #************************* - - # @abstract Structure for core alignment information. - # @field tid chromosome ID, defined by bam_hdr_t - # @field pos 0-based leftmost coordinate - # @field bin bin calculated by bam_reg2bin() - # @field qual mapping quality - # @field l_qname length of the query name - # @field flag bitwise flag - # @field n_cigar number of CIGAR operations - # @field l_qseq length of the query sequence (read) - # @field mtid chromosome ID of next read in template, defined by bam_hdr_t - # @field mpos 0-based leftmost coordinate of next read in template - ctypedef struct bam1_core_t: + hts_pos_t pos int32_t tid - int32_t pos uint16_t bin uint8_t qual - uint8_t l_qname - uint16_t flag - uint8_t unused1 uint8_t l_extranul + uint16_t flag + uint16_t l_qname uint32_t n_cigar int32_t l_qseq int32_t mtid - int32_t mpos - int32_t isize - - # @abstract Structure for one alignment. - # @field core core information about the alignment - # @field l_data current length of bam1_t::data - # @field m_data maximum length of bam1_t::data - # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux - # - # @discussion Notes: - # - # 1. qname is zero tailing and core.l_qname includes the tailing '\0'. - # 2. l_qseq is calculated from the total length of an alignment block - # on reading or from CIGAR. - # 3. cigar data is encoded 4 bytes per CIGAR operation. - # 4. seq is nybble-encoded according to seq_nt16_table. + hts_pos_t mpos + hts_pos_t isize + ctypedef struct bam1_t: bam1_core_t core + uint64_t id + uint8_t *data int l_data uint32_t m_data - uint8_t *data - uint64_t id - - # @abstract Get whether the query is on the reverse strand - # @param b pointer to an alignment - # @return boolean true if query is on the reverse strand - int bam_is_rev(bam1_t *b) - # @abstract Get whether the query's mate is on the reverse strand - # @param b pointer to an alignment - # @return boolean true if query's mate on the reverse strand - int bam_is_mrev(bam1_t *b) + int bam_is_rev(const bam1_t *b) + int bam_is_mrev(const bam1_t *b) - # @abstract Get the name of the query - # @param b pointer to an alignment - # @return pointer to the name string, null terminated char *bam_get_qname(bam1_t *b) - - # @abstract Get the CIGAR array - # @param b pointer to an alignment - # @return pointer to the CIGAR array - # - # @discussion In the CIGAR array, each element is a 32-bit integer. The - # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the - # length of a CIGAR. uint32_t *bam_get_cigar(bam1_t *b) - - # @abstract Get query sequence - # @param b pointer to an alignment - # @return pointer to sequence - # - # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, - # 8 for T and 15 for N. Two bases are packed in one byte with the base - # at the higher 4 bits having smaller coordinate on the read. It is - # recommended to use bam_seqi() macro to get the base. - char *bam_get_seq(bam1_t *b) - - # @abstract Get query quality - # @param b pointer to an alignment - # @return pointer to quality string + uint8_t *bam_get_seq(bam1_t *b) uint8_t *bam_get_qual(bam1_t *b) - - # @abstract Get auxiliary data - # @param b pointer to an alignment - # @return pointer to the concatenated auxiliary data uint8_t *bam_get_aux(bam1_t *b) + int bam_get_l_aux(const bam1_t *b) + uint8_t bam_seqi(uint8_t *s, int i) + void bam_set_seqi(uint8_t *s, int i, uint8_t b) + + sam_hdr_t *sam_hdr_init() + sam_hdr_t *bam_hdr_read(BGZF *fp) + int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) + void sam_hdr_destroy(sam_hdr_t *h) + sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) + + sam_hdr_t *bam_hdr_init() # deprecated + void bam_hdr_destroy(sam_hdr_t *h) # deprecated + sam_hdr_t *bam_hdr_dup(const sam_hdr_t *h0) # deprecated + + ctypedef htsFile samFile + + sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) + sam_hdr_t *sam_hdr_read(samFile *fp) + int sam_hdr_write(samFile *fp, const sam_hdr_t *h) + size_t sam_hdr_length(sam_hdr_t *h) + const char *sam_hdr_str(sam_hdr_t *h) + int sam_hdr_nref(const sam_hdr_t *h) + + int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len) + int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...) + int sam_hdr_find_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_val, kstring_t *ks) + int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type, int pos, kstring_t *ks) + int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value) + int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position) + int sam_hdr_update_line(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, ...) + int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value) + int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh) + int sam_hdr_count_lines(sam_hdr_t *h, const char *type) + int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key) + const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos) + + int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks) + int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks) + int sam_hdr_remove_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key) + int sam_hdr_name2tid(sam_hdr_t *h, const char *ref) + const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) + hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) + int bam_name2id(sam_hdr_t *h, const char *ref) + const char *sam_hdr_pg_id(sam_hdr_t *h, const char *name) + int sam_hdr_add_pg(sam_hdr_t *h, const char *name, ...) + char *stringify_argv(int argc, char *argv[]) + void sam_hdr_incr_ref(sam_hdr_t *h) - # @abstract Get length of auxiliary data - # @param b pointer to an alignment - # @return length of the concatenated auxiliary data - int bam_get_l_aux(bam1_t *b) - - # @abstract Get a base on read - # @param s Query sequence returned by bam1_seq() - # @param i The i-th position, 0-based - # @return 4-bit integer representing the base. - char bam_seqi(char *s, int i) - - #************************** - #*** Exported functions *** - #************************** - - #*************** - #*** BAM I/O *** - #*************** - - bam_hdr_t *bam_hdr_init() - bam_hdr_t *bam_hdr_read(BGZF *fp) - int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) - void bam_hdr_destroy(bam_hdr_t *h) - int bam_name2id(bam_hdr_t *h, const char *ref) - bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) + int sam_hdr_find_hd(sam_hdr_t *h, kstring_t *ks) + int sam_hdr_find_tag_hd(sam_hdr_t *h, const char *key, kstring_t *ks) + int sam_hdr_update_hd(sam_hdr_t *h, ...) + int sam_hdr_remove_tag_hd(sam_hdr_t *h, const char *key) bam1_t *bam_init1() void bam_destroy1(bam1_t *b) + + int BAM_USER_OWNS_STRUCT + int BAM_USER_OWNS_DATA + void bam_set_mempolicy(bam1_t *b, uint32_t policy) + uint32_t bam_get_mempolicy(bam1_t *b) + int bam_read1(BGZF *fp, bam1_t *b) int bam_write1(BGZF *fp, const bam1_t *b) bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) bam1_t *bam_dup1(const bam1_t *bsrc) + int bam_set1(bam1_t *bam, size_t l_qname, const char *qname, uint16_t flag, int32_t tid, hts_pos_t pos, + uint8_t mapq, size_t n_cigar, const uint32_t *cigar, int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, size_t l_aux) - int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) - int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) - - # @abstract Calculate the rightmost base position of an alignment on the - # reference genome. + hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) + hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) + hts_pos_t bam_endpos(const bam1_t *b) - # @param b pointer to an alignment - # @return the coordinate of the first base after the alignment, 0-based + int bam_str2flag(const char *str) + char *bam_flag2str(int flag) - # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. - # For an unmapped read (either according to its flags or if it has no cigar - # string), we return b->core.pos + 1 by convention. - int32_t bam_endpos(const bam1_t *b) + int bam_set_qname(bam1_t *b, const char *qname) + ssize_t sam_parse_cigar(const char *instr, char **end, uint32_t **a_cigar, size_t *a_mem) + ssize_t bam_parse_cigar(const char *instr, char **end, bam1_t *b) - int bam_str2flag(const char *str) # returns negative value on error - char *bam_flag2str(int flag) # The string must be freed by the user + void bam_itr_destroy(hts_itr_t *iter) # deprecated + hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) # deprecated + hts_itr_t *bam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) # deprecated + int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) # deprecated - #************************* - #*** BAM/CRAM indexing *** - #************************* - - # These BAM iterator functions work only on BAM files. To work with either - # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. - void bam_itr_destroy(hts_itr_t *iter) - hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) - int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) - - # Load/build .csi or .bai BAM index file. Does not work with CRAM. - # It is recommended to use the sam_index_* functions below instead. hts_idx_t *bam_index_load(const char *fn) int bam_index_build(const char *fn, int min_shift) - - # Load a BAM (.csi or .bai) or CRAM (.crai) index file - # @param fp File handle of the data file whose index is being opened - # @param fn BAM/CRAM/etc filename to search alongside for the index file - # @return The index, or NULL if an error occurred. + int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) + int sam_idx_save(htsFile *fp) hts_idx_t *sam_index_load(htsFile *fp, const char *fn) - - # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file - # @param fp File handle of the data file whose index is being opened - # @param fn BAM/CRAM/etc data file filename - # @param fnidx Index filename, or NULL to search alongside @a fn - # @return The index, or NULL if an error occurred. hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) - - # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file - # @param fp File handle of the data file whose index is being opened - # @param fn BAM/CRAM/etc data file filename - # @param fnidx Index filename, or NULL to search alongside @a fn - # @param flags Flags to alter behaviour - # @return The index, or NULL if an error occurred. hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) - - # Generate and save an index file - # @param fn Input BAM/etc filename, to which .csi/etc will be added - # @param min_shift Positive to generate CSI, or 0 to generate BAI - # @return 0 if successful, or negative if an error occurred (usually -1; or - # -2: opening fn failed; -3: format not indexable) int sam_index_build(const char *fn, int min_shift) - - # Generate and save an index to a specific file - # @param fn Input BAM/CRAM/etc filename - # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn - # @param min_shift Positive to generate CSI, or 0 to generate BAI - # @return 0 if successful, or negative if an error occurred. int sam_index_build2(const char *fn, const char *fnidx, int min_shift) + int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) void sam_itr_destroy(hts_itr_t *iter) - hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) - int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) + hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) + hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) + hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) + hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) + int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) + int sam_itr_multi_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) - #*************** - #*** SAM I/O *** - #*************** + const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, int flags) - htsFile *sam_open(const char *fn, const char *mode) - htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) - int sam_close(htsFile *fp) + samFile *sam_open(const char *fn, const char *mode) + samFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) + int sam_flush(samFile *fp) + int sam_close(samFile *fp) int sam_open_mode(char *mode, const char *fn, const char *format) - - # A version of sam_open_mode that can handle ,key=value options. - # The format string is allocated and returned, to be freed by the caller. - # Prefix should be "r" or "w", char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) - bam_hdr_t *sam_hdr_parse(int l_text, const char *text) - bam_hdr_t *sam_hdr_read(htsFile *fp) - int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) + int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) + + int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) + int sam_format1(const sam_hdr_t *h, const bam1_t *b, kstring_t *str) + int sam_read1(samFile *fp, sam_hdr_t *h, bam1_t *b) + int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) - int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) - int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) - int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) - int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) + cdef struct hts_filter_t + int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) - #************************************* - #*** Manipulating auxiliary fields *** - #************************************* + const uint8_t *sam_format_aux1(const uint8_t *key, const uint8_t type, const uint8_t *tag, const uint8_t *end, kstring_t *ks) - uint8_t *bam_aux_get(const bam1_t *b, const char *tag) + uint8_t *bam_aux_first(const bam1_t *b) + uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) + const char *bam_aux_tag(const uint8_t *s) + char bam_aux_type(const uint8_t *s) + int bam_aux_get_str(const bam1_t *b, const char tag[2], kstring_t *s) int64_t bam_aux2i(const uint8_t *s) double bam_aux2f(const uint8_t *s) char bam_aux2A(const uint8_t *s) char *bam_aux2Z(const uint8_t *s) + uint32_t bam_auxB_len(const uint8_t *s) + int64_t bam_auxB2i(const uint8_t *s, uint32_t idx) + double bam_auxB2f(const uint8_t *s, uint32_t idx) - void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) + int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) int bam_aux_del(bam1_t *b, uint8_t *s) + uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) + int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) + int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val) + int bam_aux_update_float(bam1_t *b, const char tag[2], float val) + int bam_aux_update_array(bam1_t *b, const char tag[2], uint8_t type, uint32_t items, void *data) - #************************** - #*** Pileup and Mpileup *** - #************************** - - # @abstract Generic pileup 'client data'. - # @discussion The pileup iterator allows setting a constructor and - # destructor function, which will be called every time a sequence is - # fetched and discarded. This permits caching of per-sequence data in - # a tidy manner during the pileup process. This union is the cached - # data to be manipulated by the "client" (the caller of pileup). - # - union bam_pileup_cd: + ctypedef union bam_pileup_cd: void *p int64_t i double f - # @abstract Structure for one alignment covering the pileup position. - # @field b pointer to the alignment - # @field qpos position of the read base at the pileup site, 0-based - # @field indel indel length; 0 for no indel, positive for ins and negative for del - # @field level the level of the read in the "viewer" mode - # @field is_del 1 iff the base on the padded read is a deletion - # @field is_head ??? - # @field is_tail ??? - # @field is_refskip ??? - # @field aux ??? - # - # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The - # difference between the two functions is that the former does not - # set bam_pileup1_t::level, while the later does. Level helps the - # implementation of alignment viewers, but calculating this has some - # overhead. - # - # is_del, is_head, etc are a bit field, declaring as below should - # work as expected, see - # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J - ctypedef struct bam_pileup1_t: bam1_t *b int32_t qpos int indel, level - uint32_t is_del - uint32_t is_head - uint32_t is_tail - uint32_t is_refskip + uint8_t is_del, is_head, is_tail, is_refskip uint32_t aux bam_pileup_cd cd + int cigar_ind ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) - ctypedef int (*bam_test_f)() - ctypedef struct __bam_plp_t - ctypedef __bam_plp_t *bam_plp_t + cdef struct bam_plp_s + ctypedef bam_plp_s *bam_plp_t - ctypedef struct __bam_mplp_t - ctypedef __bam_mplp_t *bam_mplp_t + cdef struct bam_mplp_s + ctypedef bam_mplp_s *bam_mplp_t - # bam_plp_init() - sets an iterator over multiple - # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return - # status: 0 on success, -1 on end, < -1 on non-recoverable errors - # @data: user data to pass to @func bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) void bam_plp_destroy(bam_plp_t iter) int bam_plp_push(bam_plp_t iter, const bam1_t *b) const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) + const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) void bam_plp_reset(bam_plp_t iter) + void bam_plp_constructor(bam_plp_t plp, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + void bam_plp_destructor (bam_plp_t plp, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) - bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) + ctypedef struct hts_base_mod_state + int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m, kstring_t *ins, int *del_len) - # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping - # read pairs and for each base pair set the base quality of the - # lower-quality base to zero, thus effectively discarding it from - # calling. If the two bases are identical, the quality of the other base - # is increased to the sum of their qualities (capped at 200), otherwise - # it is multiplied by 0.8. - void bam_mplp_init_overlaps(bam_mplp_t iter) + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) + int bam_mplp_init_overlaps(bam_mplp_t iter) void bam_mplp_destroy(bam_mplp_t iter) void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) + int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) void bam_mplp_reset(bam_mplp_t iter) - void bam_mplp_constructor(bam_mplp_t iter, - int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) - void bam_mplp_destructor(bam_mplp_t iter, - int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) - - # Added by AH - # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" - - - - - # // --------------------------- - # // Base modification retrieval - - # /*! @typedef - # @abstract Holds a single base modification. - # @field modified_base The short base code (m, h, etc) or -ChEBI (negative) - # @field canonical_base The canonical base referred to in the MM tag. - # One of A, C, G, T or N. Note this may not be the - # explicit base recorded in the SEQ column (esp. if N). - # @field strand 0 or 1, indicating + or - strand from MM tag. - # @field qual Quality code (256*probability), or -1 if unknown + void bam_mplp_constructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + void bam_mplp_destructor (bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + + int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) + + int BAQ_APPLY + int BAQ_EXTEND + int BAQ_REDO + int BAQ_AUTO + int BAQ_ILLUMINA + int BAQ_PACBIOCCS + int BAQ_PACBIO + int BAQ_ONT + int BAQ_GENAPSYS + int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) - # @discussion - # Note this doesn't hold any location data or information on which other - # modifications may be possible at this site. ctypedef struct hts_base_mod: int modified_base int canonical_base int strand int qual - # /// Allocates an hts_base_mode_state. - # /** - # * @return An hts_base_mode_state pointer on success, - # * NULL on failure. - # * - # * This just allocates the memory. The initialisation of the contents is - # * done using bam_parse_basemod. Successive calls may be made to that - # * without the need to free and allocate a new state. - # * - # * The state be destroyed using the hts_base_mode_state_free function. - # */ - ctypedef struct hts_base_mod_state - hts_base_mod_state *hts_base_mod_state_alloc() + int HTS_MOD_UNKNOWN + int HTS_MOD_UNCHECKED + uint32_t HTS_MOD_REPORT_UNCHECKED - # /// Destroys an hts_base_mode_state. - # /** - # * @param state The base modification state pointer. - # * - # * The should have previously been created by hts_base_mode_state_alloc. - # */ + hts_base_mod_state *hts_base_mod_state_alloc() void hts_base_mod_state_free(hts_base_mod_state *state) - - # /// Parses the Mm and Ml tags out of a bam record. - # /** - # * @param b BAM alignment record - # * @param state The base modification state pointer. - # * @return 0 on success, - # * -1 on failure. - # * - # * This fills out the contents of the modification state, resetting the - # * iterator location to the first sequence base. - # */ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) + int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, uint32_t flags) + int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods) + int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods, int *pos) + int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, hts_base_mod *mods, int n_mods) + int bam_mods_query_type(hts_base_mod_state *state, int code, int *strand, int *implicit, char *canonical) + int bam_mods_queryi(hts_base_mod_state *state, int i, int *strand, int *implicit, char *canonical) + int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) + + +cdef extern from "htslib/cram.h" nogil: + + cdef enum cram_block_method: + CRAM_COMP_UNKNOWN + CRAM_COMP_RAW + CRAM_COMP_GZIP + CRAM_COMP_BZIP2 + CRAM_COMP_LZMA + CRAM_COMP_RANS4x8 + CRAM_COMP_RANSNx16 + CRAM_COMP_ARITH + CRAM_COMP_FQZ + CRAM_COMP_TOK3 + + ctypedef struct cram_method_details: + cram_block_method method + int level + int order + int rle + int pack + int stripe + int cat + int nosz + int Nway + int ext + + cdef enum cram_content_type: + CT_ERROR + FILE_HEADER + COMPRESSION_HEADER + MAPPED_SLICE + UNMAPPED_SLICE + EXTERNAL + CORE + + ctypedef struct cram_file_def + ctypedef struct cram_fd + ctypedef struct cram_container + ctypedef struct cram_block + ctypedef struct cram_slice + ctypedef struct cram_metrics + ctypedef struct cram_block_slice_hdr + ctypedef struct cram_block_compression_hdr + ctypedef struct cram_codec + ctypedef struct refs_t + + sam_hdr_t *cram_fd_get_header(cram_fd *fd) + void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr) + + int cram_fd_get_version(cram_fd *fd) + void cram_fd_set_version(cram_fd *fd, int vers) + + int cram_major_vers(cram_fd *fd) + int cram_minor_vers(cram_fd *fd) + + hFILE *cram_fd_get_fp(cram_fd *fd) + void cram_fd_set_fp(cram_fd *fd, hFILE *fp) + + int32_t cram_container_get_length(cram_container *c) + void cram_container_set_length(cram_container *c, int32_t length) + int32_t cram_container_get_num_blocks(cram_container *c) + void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) + int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) + void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, int32_t *landmarks) + int32_t cram_container_get_num_records(cram_container *c) + int64_t cram_container_get_num_bases(cram_container *c) + int cram_container_is_empty(cram_fd *fd) + void cram_container_get_coords(cram_container *c, int *refid, hts_pos_t *start, hts_pos_t *span) + + int32_t cram_block_get_content_id(cram_block *b) + int32_t cram_block_get_comp_size(cram_block *b) + int32_t cram_block_get_uncomp_size(cram_block *b) + int32_t cram_block_get_crc32(cram_block *b) + void * cram_block_get_data(cram_block *b) + + cram_content_type cram_block_get_content_type(cram_block *b) + cram_block_method cram_block_get_method(cram_block *b) + cram_method_details *cram_expand_method(uint8_t *data, int32_t size, cram_block_method comp) + + void cram_block_set_content_id(cram_block *b, int32_t id) + void cram_block_set_comp_size(cram_block *b, int32_t size) + void cram_block_set_uncomp_size(cram_block *b, int32_t size) + void cram_block_set_crc32(cram_block *b, int32_t crc) + void cram_block_set_data(cram_block *b, void *data) + + int cram_block_append(cram_block *b, const void *data, int size) + void cram_block_update_size(cram_block *b) + + size_t cram_block_get_offset(cram_block *b) + void cram_block_set_offset(cram_block *b, size_t offset) + uint32_t cram_block_size(cram_block *b) + + void cram_codec_get_content_ids(cram_codec *c, int ids[2]) + int cram_codec_describe(cram_codec *c, kstring_t *ks) + + int cram_transcode_rg(cram_fd *input, cram_fd *output, cram_container *c, int nrg, int *in_rg, int *out_rg) + + int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) + int cram_filter_container(cram_fd *input, cram_fd *output, cram_container *c, int *ref_id) + cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block *b) + void cram_free_compression_header(cram_block_compression_hdr *hdr) + + ctypedef struct cram_cid2ds_t - # /// Finds the next location containing base modifications and returns them - # /** - # * @param b BAM alignment record - # * @param state The base modification state pointer. - # * @param mods A supplied array for returning base modifications - # * @param n_mods The size of the mods array - # * @return The number of modifications found on success, - # * 0 if no more modifications are present, - # * -1 on failure. - # * - # * Unlike bam_mods_at_next_pos this skips ahead to the next site - # * with modifications. - # * - # * If more than n_mods modifications are found, the total found is returned. - # * Note this means the caller needs to check whether this is higher than - # * n_mods. - # */ - - int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos) - - # *********************************** - # * BAQ calculation and realignment * - # ***********************************/ - int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) - int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) + cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, cram_cid2ds_t *cid2ds) + int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n) + void cram_cid2ds_free(cram_cid2ds_t *cid2ds) + + int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks) + + int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) + int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) + void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, int *refid, hts_pos_t *start, hts_pos_t *span) + cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) + void cram_free_slice_header(cram_block_slice_hdr *hdr) + + cram_block *cram_new_block(cram_content_type content_type, int content_id) + cram_block *cram_read_block(cram_fd *fd) + int cram_write_block(cram_fd *fd, cram_block *b) + void cram_free_block(cram_block *b) + int cram_uncompress_block(cram_block *b) + int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level) + int cram_compress_block2(cram_fd *fd, cram_slice *s, cram_block *b, cram_metrics *metrics, int method, int level) + + cram_container *cram_new_container(int nrec, int nslice) + void cram_free_container(cram_container *c) + cram_container *cram_read_container(cram_fd *fd) + int cram_write_container(cram_fd *fd, cram_container *h) + int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) + int cram_container_size(cram_container *c) + + cram_fd *cram_open(const char *filename, const char *mode) + cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) + int cram_close(cram_fd *fd) + int cram_seek(cram_fd *fd, off_t offset, int whence) + int cram_flush(cram_fd *fd) + int cram_eof(cram_fd *fd) + int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) + int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) + int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) + int cram_check_EOF(cram_fd *fd) + + int int32_put_blk(cram_block *b, int32_t val) + + ctypedef sam_hdr_t SAM_hdr # deprecated + SAM_hdr *sam_hdr_parse_(const char *hdr, size_t len) # deprecated + void sam_hdr_free(SAM_hdr *hdr) # deprecated + int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) # deprecated + + refs_t *cram_get_refs(htsFile *fd) + int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, off_t *first, off_t *last) + int64_t cram_num_containers(cram_fd *fd) + int64_t cram_num_containers_between(cram_fd *fd, off_t cstart, off_t cend, int64_t *first, int64_t *last) + off_t cram_container_num2offset(cram_fd *fd, int64_t n) + int64_t cram_container_offset2num(cram_fd *fd, off_t pos) cdef extern from "htslib/faidx.h" nogil: @@ -1352,201 +1030,133 @@ cdef extern from "htslib/faidx.h" nogil: ctypedef struct faidx_t: pass - # /// Build index for a FASTA or bgzip-compressed FASTA file. - # /** @param fn FASTA file name - # @param fnfai Name of .fai file to build. - # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). - # @return 0 on success; or -1 on failure - - # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. - # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI - # file will only be built if fn is bgzip-compressed. - # */ - int fai_build3(const char *fn, - const char *fnfai, - const char *fngzi) - - # /// Build index for a FASTA or bgzip-compressed FASTA file. - # /** @param fn FASTA file name - # @return 0 on success; or -1 on failure - # - # File "fn.fai" will be generated. This function is equivalent to - # fai_build3(fn, NULL, NULL); - # */ - int fai_build(char *fn) - - # /// Destroy a faidx_t struct + cdef enum fai_format_options: + FAI_NONE + FAI_FASTA + FAI_FASTQ + + int fai_build3(const char *fn, const char *fnfai, const char *fngzi) + int fai_build(const char *fn) void fai_destroy(faidx_t *fai) - # /// Load FASTA indexes. - # /** @param fn File name of the FASTA file (can be compressed with bgzip). - # @param fnfai File name of the FASTA index. - # @param fngzi File name of the bgzip index. - # @param flags Option flags to control index file caching and creation. - # @return Pointer to a faidx_t struct on success, NULL on failure. - - # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. - # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. - # The bgzip index is only needed if fn is compressed. - - # If (flags & FAI_CREATE) is true, the index files will be built using - # fai_build3() if they are not already present. - # */ - faidx_t *fai_load3(const char *fn, - const char *fnfai, - const char *fngzi, - int flags) - - # /// Load index from "fn.fai". - # /** @param fn File name of the FASTA file - # @return Pointer to a faidx_t struct on success, NULL on failure. - # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); - # */ - faidx_t *fai_load(char *fn) - - # /// Fetch the sequence in a region - # /** @param fai Pointer to the faidx_t struct - # @param reg Region in the format "chr2:20,000-30,000" - # @param len Length of the region; -2 if seq not present, -1 general error - # @return Pointer to the sequence; `NULL` on failure - # The returned sequence is allocated by `malloc()` family and should be destroyed - # by end users by calling `free()` on it. - # */ - char *fai_fetch(faidx_t *fai, - char *reg, - int *len) - - # /// Fetch the sequence in a region - # /** @param fai Pointer to the faidx_t struct - # @param c_name Region name - # @param p_beg_i Beginning position number (zero-based) - # @param p_end_i End position number (zero-based) - # @param len Length of the region; -2 if c_name not present, -1 general error - # @return Pointer to the sequence; null on failure - # The returned sequence is allocated by `malloc()` family and should be destroyed - # by end users by calling `free()` on it. - # */ - char *faidx_fetch_seq(faidx_t *fai, - char *c_name, - int p_beg_i, - int p_end_i, - int *len) - - # /// Query if sequence is present - # /** @param fai Pointer to the faidx_t struct - # @param seq Sequence name - # @return 1 if present or 0 if absent - # */ - int faidx_has_seq(faidx_t *fai, const char *seq) - - # /// Fetch the number of sequences - # /** @param fai Pointer to the faidx_t struct - # @return The number of sequences - # */ + cdef enum fai_load_options: + FAI_CREATE + + faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi, int flags) + faidx_t *fai_load(const char *fn) + faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi, int flags, fai_format_options format) + faidx_t *fai_load_format(const char *fn, fai_format_options format) + + char *fai_fetch(const faidx_t *fai, const char *reg, int *len) + char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len) + hts_pos_t fai_line_length(const faidx_t *fai, const char *reg) + char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len) + char *fai_fetchqual64(const faidx_t *fai, const char *reg, hts_pos_t *len) + int faidx_fetch_nseq(const faidx_t *fai) # deprecated + char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) + char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) + char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) + char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) + + int faidx_has_seq(const faidx_t *fai, const char *seq) int faidx_nseq(const faidx_t *fai) - - # /// Return name of i-th sequence const char *faidx_iseq(const faidx_t *fai, int i) + hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq) + int faidx_seq_len(const faidx_t *fai, const char *seq) + + const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, int flags) + int fai_adjust_region(const faidx_t *fai, int tid, hts_pos_t *beg, hts_pos_t *end) + + void fai_set_cache_size(faidx_t *fai, int cache_size) + int fai_thread_pool(faidx_t *fai, hts_tpool *pool, int qsize) + + char *fai_path(const char *fa) - # /// Return sequence length, -1 if not present - int faidx_seq_len(faidx_t *fai, const char *seq) -# tabix support cdef extern from "htslib/tbx.h" nogil: - # tbx.h definitions int8_t TBX_MAX_SHIFT int32_t TBX_GENERIC int32_t TBX_SAM int32_t TBX_VCF + int32_t TBX_GAF int32_t TBX_UCSC ctypedef struct tbx_conf_t: int32_t preset - int32_t sc, bc, ec # seq col., beg col. and end col. + int32_t sc, bc, ec int32_t meta_char, line_skip - ctypedef struct tbx_t: + ctypedef struct tbx_t: # struct will be made opaque in future tbx_conf_t conf hts_idx_t *idx - void * dict + void *dict tbx_conf_t tbx_conf_gff tbx_conf_t tbx_conf_bed tbx_conf_t tbx_conf_psltbl tbx_conf_t tbx_conf_sam tbx_conf_t tbx_conf_vcf + tbx_conf_t tbx_conf_gaf - void tbx_itr_destroy(hts_itr_t * iter) - hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end) - hts_itr_t * tbx_itr_querys(tbx_t * t, char * s) - int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data) + void tbx_itr_destroy(hts_itr_t *iter) + hts_itr_t *tbx_itr_queryi(tbx_t *tbx, int tid, int beg, int end) + hts_itr_t *tbx_itr_querys(tbx_t *tbx, const char *s) + int tbx_itr_next(htsFile *fp, tbx_t *tbx, hts_itr_t *itr, kstring_t *r) + int tbx_bgzf_itr_next(BGZF *fp, tbx_t *tbx, hts_itr_t *itr, kstring_t *r) - int tbx_name2id(tbx_t *tbx, char *ss) + int tbx_name2id(tbx_t *tbx, const char *ss) + BGZF *hts_get_bgzfp(htsFile *fp) - int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf) + tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) + int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf) int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf) + int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf) - tbx_t * tbx_index_load(char *fn) + tbx_t *tbx_index_load(const char *fn) tbx_t *tbx_index_load2(const char *fn, const char *fnidx) tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) - # free the array but not the values - char **tbx_seqnames(tbx_t *tbx, int *n) + const char **tbx_seqnames(tbx_t *tbx, int *n) void tbx_destroy(tbx_t *tbx) -# VCF/BCF API cdef extern from "htslib/vcf.h" nogil: - # Header struct - - uint8_t BCF_HL_FLT # header line + uint8_t BCF_HL_FLT uint8_t BCF_HL_INFO uint8_t BCF_HL_FMT uint8_t BCF_HL_CTG - uint8_t BCF_HL_STR # structured header line TAG= - uint8_t BCF_HL_GEN # generic header line + uint8_t BCF_HL_STR + uint8_t BCF_HL_GEN - uint8_t BCF_HT_FLAG # header type + uint8_t BCF_HT_FLAG uint8_t BCF_HT_INT uint8_t BCF_HT_REAL uint8_t BCF_HT_STR - uint8_t BCF_VL_FIXED # variable length + uint8_t BCF_VL_FIXED uint8_t BCF_VL_VAR uint8_t BCF_VL_A uint8_t BCF_VL_G uint8_t BCF_VL_R - # === Dictionary === - # - # The header keeps three dictionaries. The first keeps IDs in the - # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths - # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] - # is the actual hash table, which is opaque to the end users. In the hash - # table, the key is the ID or sample name as a C string and the value is a - # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash - # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the - # size of the hash table or, equivalently, the length of the id[] arrays. - - uint8_t BCF_DT_ID # dictionary type + uint8_t BCF_DT_ID uint8_t BCF_DT_CTG uint8_t BCF_DT_SAMPLE - # Complete textual representation of a header line ctypedef struct bcf_hrec_t: - int type # One of the BCF_HL_* type - char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. - char *value # Set only for generic lines, NULL for FILTER/INFO, etc. - int nkeys # Number of structured fields - char **keys # The key=value pairs + int type + char *key + char *value + int nkeys + char **keys char **vals ctypedef struct bcf_idinfo_t: - uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2] - bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG + uint64_t info[3] + bcf_hrec_t *hrec[3] int id ctypedef struct bcf_idpair_t: @@ -1554,23 +1164,21 @@ cdef extern from "htslib/vcf.h" nogil: const bcf_idinfo_t *val ctypedef struct bcf_hdr_t: - int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) + int32_t n[3] bcf_idpair_t *id[3] - void *dict[3] # ID dictionary, contig dict and sample dict + void *dict[3] char **samples bcf_hrec_t **hrec int nhrec, dirty int ntransl - int *transl[2] # for bcf_translate() - int nsamples_ori # for bcf_hdr_set_samples() + int *transl[2] + int nsamples_ori uint8_t *keep_samples kstring_t mem - int32_t m[3] # m: allocated size of the dictionary block in use (see n above) + int32_t m[3] uint8_t bcf_type_shift[] - # * VCF record * - uint8_t BCF_BT_NULL uint8_t BCF_BT_INT8 uint8_t BCF_BT_INT16 @@ -1586,33 +1194,30 @@ cdef extern from "htslib/vcf.h" nogil: uint8_t VCF_BND uint8_t VCF_OVERLAP - - ctypedef struct variant_t: - int type, n # variant type and the number of bases affected, negative for deletions + ctypedef struct bcf_variant_t: + int type, n ctypedef struct bcf_fmt_t: - int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key - int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types - uint8_t *p # same as vptr and vptr_* in bcf_info_t below + int id + int n, size, type + uint8_t *p uint32_t p_len uint32_t p_off uint8_t p_free - union bcf_info_union_t: - int32_t i # integer value - float f # float value + cdef union bcf_info_v1_union: + int64_t i + float f ctypedef struct bcf_info_t: - int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key - int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars - - # v1 union only set if $len==1; for easier access - bcf_info_union_t v1 - uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes - uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset - uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes - uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new - # data block is bigger than the original + int key + int type + bcf_info_v1_union v1 + uint8_t *vptr + uint32_t vptr_len + uint32_t vptr_off + uint8_t vptr_free + int len uint8_t BCF1_DIRTY_ID uint8_t BCF1_DIRTY_ALS @@ -1620,18 +1225,18 @@ cdef extern from "htslib/vcf.h" nogil: uint8_t BCF1_DIRTY_INF ctypedef struct bcf_dec_t: - int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change - int n_flt # Number of FILTER fields - int *flt # FILTER keys in the dictionary - char *id # ID - char *als # REF+ALT block (\0-seperated) - char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated - bcf_info_t *info # INFO - bcf_fmt_t *fmt # FORMAT and individual sample - variant_t *var # $var and $var_type set only when set_variant_types called + int m_fmt, m_info, m_id, m_als, m_allele, m_flt + int n_flt + int *flt + char *id + char *als + char **allele + bcf_info_t *info + bcf_fmt_t *fmt + bcf_variant_t *var int n_var, var_type - int shared_dirty # if set, shared.s must be recreated on BCF output - int indiv_dirty # if set, indiv.s must be recreated on BCF output + int shared_dirty + int indiv_dirty uint8_t BCF_ERR_CTG_UNDEF uint8_t BCF_ERR_TAG_UNDEF @@ -1641,234 +1246,98 @@ cdef extern from "htslib/vcf.h" nogil: uint8_t BCF_ERR_CTG_INVALID uint8_t BCF_ERR_TAG_INVALID - # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file - # is slower because the string is first to be parsed, packed into BCF line - # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it - # is known in advance that some of the fields will not be required (notably - # the sample columns), parsing of these can be skipped by setting max_unpack - # appropriately. - # Similarly, it is fast to output a BCF line because the columns (kept in - # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF - # line must be formatted in vcf_format. + const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) ctypedef struct bcf1_t: - int32_t rid # CHROM - int32_t pos # POS - int32_t rlen # length of REF - float qual # QUAL + hts_pos_t pos + hts_pos_t rlen + int32_t rid + float qual uint32_t n_info, n_allele uint32_t n_fmt, n_sample kstring_t shared, indiv - bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() - int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed - int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work - int unpack_size[3] # the original block size of ID, REF+ALT and FILTER - int errcode # one of BCF_ERR_* codes - - ####### API ####### - - # BCF and VCF I/O - # - # A note about naming conventions: htslib internally represents VCF - # records as bcf1_t data structures, therefore most functions are - # prefixed with bcf_. There are a few exceptions where the functions must - # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In - # these cases, functions prefixed with bcf_ are more general and work - # with both BCF and VCF. - - # bcf_hdr_init() - create an empty BCF header. - # @param mode "r" or "w" - # - # When opened for writing, the mandatory fileFormat and - # FILTER=PASS lines are added automatically. - bcf_hdr_t *bcf_hdr_init(const char *mode) + bcf_dec_t d + int max_unpack + int unpacked + int unpack_size[3] + int errcode - # Destroy a BCF header struct - void bcf_hdr_destroy(bcf_hdr_t *h) + bcf1_t *bcf_init1() + int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int bcf_write1(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) + int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + void bcf_destroy1(bcf1_t *v) + void bcf_empty1(bcf1_t *v) + int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) + void bcf_clear1(bcf1_t *v) + int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) - # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) + bcf_hdr_t *bcf_hdr_init(const char *mode) + void bcf_hdr_destroy(bcf_hdr_t *h) bcf1_t *bcf_init() - - # Deallocate a bcf1_t object void bcf_destroy(bcf1_t *v) - - # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, - # not the bcf1_t object itself. void bcf_empty(bcf1_t *v) - - # Make the bcf1_t object ready for next read. Intended mostly for - # internal use, the user should rarely need to call this function - # directly. void bcf_clear(bcf1_t *v) - # Reads VCF or BCF header - bcf_hdr_t *bcf_hdr_read(htsFile *fp) + ctypedef htsFile vcfFile + vcfFile *bcf_open(const char *fn, const char *mode) + vcfFile *vcf_open(const char *fn, const char *mode) + int bcf_flush(vcfFile *fp) + int bcf_close(vcfFile *fp) + int vcf_close(vcfFile *fp) - # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed - # @samples: samples to include or exclude from file or as a comma-separated string. - # LIST|FILE .. select samples in list/file - # ^LIST|FILE .. exclude samples from list/file - # - .. include all samples - # NULL .. exclude all samples - # @is_file: @samples is a file (1) or a comma-separated list (0) - # - # The bottleneck of VCF reading is parsing of genotype fields. If the - # reader knows in advance that only subset of samples is needed (possibly - # no samples at all), the performance of bcf_read() can be significantly - # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). - # The function bcf_read() will subset the VCF/BCF records automatically - # with the notable exception when reading records via bcf_itr_next(). - # In this case, bcf_subset_format() must be called explicitly, because - # bcf_readrec() does not see the header. - # - # Returns 0 on success, -1 on error or a positive integer if the list - # contains samples not present in the VCF header. In such a case, the - # return value is the index of the offending sample. - # + bcf_hdr_t *bcf_hdr_read(htsFile *fp) int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) - - # Writes VCF or BCF header int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) - # Parse VCF line contained in kstring and populate the bcf1_t struct int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) - - # The opposite of vcf_parse. It should rarely be called directly, see vcf_write + int vcf_open_mode(char *mode, const char *fn, const char *format) int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) - - # bcf_read() - read next VCF or BCF record - # - # Returns -1 on critical errors, 0 otherwise. On errors which are not - # critical for reading, such as missing header definitions, v->errcode is - # set to one of BCF_ERR* code and must be checked before calling - # vcf_write(). int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) - # - # Note that bcf_unpack() must be called even when reading VCF. It is safe - # to call the function repeatedly, it will not unpack the same field - # twice. - uint8_t BCF_UN_STR # up to ALT inclusive - uint8_t BCF_UN_FLT # up to FILTER - uint8_t BCF_UN_INFO # up to INFO - uint8_t BCF_UN_SHR # all shared information - uint8_t BCF_UN_FMT # unpack format and each sample - uint8_t BCF_UN_IND # a synonymo of BCF_UN_FMT - uint8_t BCF_UN_ALL # everything + uint8_t BCF_UN_STR + uint8_t BCF_UN_FLT + uint8_t BCF_UN_INFO + uint8_t BCF_UN_SHR + uint8_t BCF_UN_FMT + uint8_t BCF_UN_IND + uint8_t BCF_UN_ALL int bcf_unpack(bcf1_t *b, int which) - # bcf_dup() - create a copy of BCF record. - # - # Note that bcf_unpack() must be called on the returned copy as if it was - # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) - # internally to reflect any changes made by bcf_update_* functions. bcf1_t *bcf_dup(bcf1_t *src) bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) - - # bcf_write() - write one VCF or BCF record. The type is determined at the open() call. int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) - # The following functions work only with VCFs and should rarely be called - # directly. Usually one wants to use their bcf_* alternatives, which work - # transparently with both VCFs and BCFs. bcf_hdr_t *vcf_hdr_read(htsFile *fp) int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int vcf_write_line(htsFile *fp, kstring_t *line) - #************************************************************************ - # Header querying and manipulation routines - #************************************************************************ - - # Create a new header using the supplied template bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) - - # Copy header lines from src to dst if not already present in dst. See also bcf_translate(). - # Returns 0 on success or sets a bit on error: - # 1 .. conflicting definitions of tag length - # # todo - int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) - - # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() - # @param dst: the destination header to be merged into, NULL on the first pass - # @param src: the source header - # - # Notes: - # - use as: - # bcf_hdr_t *dst = NULL; - # for (i=0; i 0 ) - # for (i=0; i=0 - # - # The returned values are: - # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* - # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields - # bcf_hdr_id2type .. the field type, one of BCF_HT_* - # bcf_hdr_id2coltype .. the column type, one of BCF_HL_* - # - # Notes: Prior to using the macros, the presence of the info should be - # tested with bcf_hdr_idinfo_exists(). - # + const char *bcf_seqname(const bcf_hdr_t *hdr, const bcf1_t *rec) + const char *bcf_seqname_safe(const bcf_hdr_t *hdr, const bcf1_t *rec) + int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id) int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id) int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id) int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id) int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id) - bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id) + bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int dict_type, int col_type, int int_id) - void bcf_fmt_array(kstring_t *s, int n, int type, void *data) + int bcf_fmt_array(kstring_t *s, int n, int type, void *data) uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) - void bcf_enc_vchar(kstring_t *s, int l, const char *a) - void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) - void bcf_enc_vfloat(kstring_t *s, int n, float *a) + int bcf_enc_vchar(kstring_t *s, int l, const char *a) + int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) + int bcf_enc_vfloat(kstring_t *s, int n, float *a) - #************************************************************************ - # BCF index - # - # Note that these functions work with BCFs only. See synced_bcf_reader.h - # which provides (amongst other things) an API to work transparently with - # both indexed BCFs and VCFs. - #************************************************************************ + void bcf_itr_destroy(hts_itr_t *iter) + hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) + hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, const char *s) + int bcf_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) + hts_idx_t *bcf_index_load(const char *fn) + const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) int bcf_index_build(const char *fn, int min_shift) int bcf_index_build2(const char *fn, const char *fnidx, int min_shift) + int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads) - #******************* - # Typed value I/O * - #****************** - - # Note that in contrast with BCFv2.1 specification, HTSlib implementation - # allows missing values in vectors. For integer types, the values 0x80, - # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, - # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of - # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an - # end-of-vector indicator. - # Note that the end-of-vector byte is not part of the vector. - - # This trial BCF version (v2.2) is compatible with the VCF specification and - # enables to handle correctly vectors with different ploidy in presence of - # missing values. + int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) + int bcf_idx_save(htsFile *fp) int32_t bcf_int8_vector_end int32_t bcf_int16_vector_end int32_t bcf_int32_vector_end + int64_t bcf_int64_vector_end int32_t bcf_str_vector_end int32_t bcf_int8_missing int32_t bcf_int16_missing int32_t bcf_int32_missing + int64_t bcf_int64_missing int32_t bcf_str_missing + int8_t BCF_MAX_BT_INT8 + int16_t BCF_MAX_BT_INT16 + int32_t BCF_MAX_BT_INT32 + int8_t BCF_MIN_BT_INT8 + int16_t BCF_MIN_BT_INT16 + int32_t BCF_MIN_BT_INT32 + uint32_t bcf_float_vector_end uint32_t bcf_float_missing void bcf_float_set(float *ptr, uint32_t value) - void bcf_float_set_vector_end(float *x) - void bcf_float_set_missing(float *x) + void bcf_float_set_vector_end(float *x) # takes pointer whereas C macro takes variable itself + void bcf_float_set_missing(float *x) # takes pointer whereas C macro takes variable itself int bcf_float_is_missing(float f) int bcf_float_is_vector_end(float f) - void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) - void bcf_enc_size(kstring_t *s, int size, int type) + + int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) + + int bcf_enc_size(kstring_t *s, int size, int type) int bcf_enc_inttype(long x) - void bcf_enc_int1(kstring_t *s, int32_t x) - int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) - int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) + int bcf_enc_int1(kstring_t *s, int32_t x) + int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) + int64_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) - # These trivial wrappers are defined only for consistency with other parts of htslib - bcf1_t *bcf_init1() - int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - void bcf_destroy1(bcf1_t *v) - void bcf_empty1(bcf1_t *v) - int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) - void bcf_clear1(bcf1_t *v) - int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) - # Other nice wrappers - void bcf_itr_destroy(hts_itr_t *iter) - hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) - int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) - hts_idx_t *bcf_index_load(const char *fn) - const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) +cdef extern from "htslib/vcfutils.h" nogil: + cdef struct kbitset_t -# VCF/BCF utility functions -cdef extern from "htslib/vcfutils.h" nogil: - struct kbitset_t - - # bcf_trim_alleles() - remove ALT alleles unused in genotype fields - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtain from vcf_parse1 - # - # Returns the number of removed alleles on success or negative - # on error: - # -1 .. some allele index is out of bounds int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) - - # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @mask: alleles to remove - # - # If you have more than 31 alleles, then the integer bit mask will - # overflow, so use bcf_remove_allele_set instead - void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) - - # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @rm_set: pointer to kbitset_t object with bits set for allele - # indexes to remove - # - # Number=A,R,G INFO and FORMAT fields will be updated accordingly. - void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set) - - # bcf_calc_ac() - calculate the number of REF and ALT alleles - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @ac: array of length line->n_allele - # @which: determine if INFO/AN,AC and indv fields be used - # - # Returns 1 if the call succeeded, or 0 if the value could not - # be determined. - # - # The value of @which determines if existing INFO/AC,AN can be - # used (BCF_UN_INFO) and and if indv fields can be split (BCF_UN_FMT). + int bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) # deprecated + int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const kbitset_t *rm_set) int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) - # bcf_gt_type() - determines type of the genotype - # @fmt_ptr: the GT format field as set for example by set_fmt_ptr - # @isample: sample index (starting from 0) - # @ial: index of the 1st non-reference allele (starting from 1) - # @jal: index of the 2nd non-reference allele (starting from 1) - # - # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA, - # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial - # is not NULL and the genotype has one or more non-reference - # alleles, $ial will be set. In case of GT_HET_AA, $ial is the - # position of the allele which appeared first in ALT. If $jal is - # not null and the genotype is GT_HET_AA, $jal will be set and is - # the position of the second allele in ALT. - uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation + uint8_t GT_HOM_RR uint8_t GT_HOM_AA uint8_t GT_HET_RA uint8_t GT_HET_AA @@ -2248,461 +1511,9 @@ cdef extern from "htslib/vcfutils.h" nogil: int bcf_acgt2int(char c) char bcf_int2acgt(int i) - # bcf_ij2G() - common task: allele indexes to Number=G index (diploid) - # @i,j: allele indexes, 0-based, i<=j - # Returns index to the Number=G diploid array uint32_t bcf_ij2G(uint32_t i, uint32_t j) -cdef extern from "htslib/cram.h" nogil: - - enum cram_block_method: - ERROR - RAW - GZIP - BZIP2 - LZMA - RANS - RANS0 - RANS1 - GZIP_RLE - - enum cram_content_type: - CT_ERROR - FILE_HEADER - COMPRESSION_HEADER - MAPPED_SLICE - UNMAPPED_SLICE - EXTERNAL - CORE - - # Opaque data types, see cram_structs for the fully fledged versions. - ctypedef struct SAM_hdr - ctypedef struct cram_file_def - ctypedef struct cram_fd - ctypedef struct cram_container - ctypedef struct cram_block - ctypedef struct cram_slice - ctypedef struct cram_metrics - ctypedef struct cram_block_slice_hdr - ctypedef struct cram_block_compression_hdr - ctypedef struct refs_t - - # Accessor functions - - # - #----------------------------------------------------------------------------- - # cram_fd - # - SAM_hdr *cram_fd_get_header(cram_fd *fd) - void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) - - int cram_fd_get_version(cram_fd *fd) - void cram_fd_set_version(cram_fd *fd, int vers) - - int cram_major_vers(cram_fd *fd) - int cram_minor_vers(cram_fd *fd) - - hFILE *cram_fd_get_fp(cram_fd *fd) - void cram_fd_set_fp(cram_fd *fd, hFILE *fp) - - # - #----------------------------------------------------------------------------- - # cram_container - # - int32_t cram_container_get_length(cram_container *c) - void cram_container_set_length(cram_container *c, int32_t length) - int32_t cram_container_get_num_blocks(cram_container *c) - void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) - int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) - void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, - int32_t *landmarks) - - # Returns true if the container is empty (EOF marker) */ - int cram_container_is_empty(cram_fd *fd) - - - # - #----------------------------------------------------------------------------- - # cram_block - # - int32_t cram_block_get_content_id(cram_block *b) - int32_t cram_block_get_comp_size(cram_block *b) - int32_t cram_block_get_uncomp_size(cram_block *b) - int32_t cram_block_get_crc32(cram_block *b) - void * cram_block_get_data(cram_block *b) - - cram_content_type cram_block_get_content_type(cram_block *b) - - void cram_block_set_content_id(cram_block *b, int32_t id) - void cram_block_set_comp_size(cram_block *b, int32_t size) - void cram_block_set_uncomp_size(cram_block *b, int32_t size) - void cram_block_set_crc32(cram_block *b, int32_t crc) - void cram_block_set_data(cram_block *b, void *data) - - int cram_block_append(cram_block *b, void *data, int size) - void cram_block_update_size(cram_block *b) - - # Offset is known as "size" internally, but it can be confusing. - size_t cram_block_get_offset(cram_block *b) - void cram_block_set_offset(cram_block *b, size_t offset) - - # - # Computes the size of a cram block, including the block - # header itself. - # - uint32_t cram_block_size(cram_block *b) - - # - # Renumbers RG numbers in a cram compression header. - # - # CRAM stores RG as the Nth number in the header, rather than a - # string holding the ID: tag. This is smaller in space, but means - # "samtools cat" to join files together that contain single but - # different RG lines needs a way of renumbering them. - # - # The file descriptor is expected to be immediately after the - # cram_container structure (ie before the cram compression header). - # Due to the nature of the CRAM format, this needs to read and write - # the blocks itself. Note that there may be multiple slices within - # the container, meaning multiple compression headers to manipulate. - # Changing RG may change the size of the compression header and - # therefore the length field in the container. Hence we rewrite all - # blocks just in case and also emit the adjusted container. - # - # The current implementation can only cope with renumbering a single - # RG (and only then if it is using HUFFMAN or BETA codecs). In - # theory it *may* be possible to renumber multiple RGs if they use - # HUFFMAN to the CORE block or use an external block unshared by any - # other data series. So we have an API that can be upgraded to - # support this, but do not implement it for now. An example - # implementation of RG as an EXTERNAL block would be to find that - # block and rewrite it, returning the number of blocks consumed. - # - # Returns 0 on success; - # -1 if unable to edit; - # -2 on other errors (eg I/O). - # - int cram_transcode_rg(cram_fd *input, cram_fd *output, - cram_container *c, - int nrg, int *in_rg, int *out_rg) - - # - # Copies the blocks representing the next num_slice slices from a - # container from 'in' to 'out'. It is expected that the file pointer - # is just after the read of the cram_container and cram compression - # header. - # - # Returns 0 on success - # -1 on failure - # - int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) - - # - #----------------------------------------------------------------------------- - # SAM_hdr - # - - # Tokenises a SAM header into a hash table. - # - # Also extracts a few bits on specific data types, such as @RG lines. - # - # @return - # Returns a SAM_hdr struct on success (free with sam_hdr_free()) - # NULL on failure - # - SAM_hdr *sam_hdr_parse_(const char *hdr, int len) - - - # - #----------------------------------------------------------------------------- - # cram_io basics - # - - # CRAM blocks - the dynamically growable data block. We have code to - # create, update, (un)compress and read/write. - # - # These are derived from the deflate_interlaced.c blocks, but with the - # CRAM extension of content types and IDs. - # - - # Allocates a new cram_block structure with a specified content_type and - # id. - # - # @return - # Returns block pointer on success; - # NULL on failure - # - cram_block *cram_new_block(cram_content_type content_type, - int content_id) - - # Reads a block from a cram file. - # - # @return - # Returns cram_block pointer on success; - # NULL on failure - # - cram_block *cram_read_block(cram_fd *fd) - - # Writes a CRAM block. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_write_block(cram_fd *fd, cram_block *b) - - # Frees a CRAM block, deallocating internal data too. - # - void cram_free_block(cram_block *b) - - # Uncompresses a CRAM block, if compressed. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_uncompress_block(cram_block *b) - - # Compresses a block. - # - # Compresses a block using one of two different zlib strategies. If we only - # want one choice set strat2 to be -1. - # - # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED - # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is - # significantly faster. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int method, int level) - - # Containers - # - - # Creates a new container, specifying the maximum number of slices - # and records permitted. - # - # @return - # Returns cram_container ptr on success; - # NULL on failure - # - cram_container *cram_new_container(int nrec, int nslice) - void cram_free_container(cram_container *c) - - # Reads a container header. - # - # @return - # Returns cram_container on success; - # NULL on failure or no container left (fd->err == 0). - # - cram_container *cram_read_container(cram_fd *fd) - - # Writes a container structure. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_write_container(cram_fd *fd, cram_container *h) - - # - # Stores the container structure in dat and returns *size as the - # number of bytes written to dat[]. The input size of dat is also - # held in *size and should be initialised to cram_container_size(c). - # - # Returns 0 on success; - # -1 on failure - # - int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) - - int cram_container_size(cram_container *c) - - # The top-level cram opening, closing and option handling - # - - # Opens a CRAM file for read (mode "rb") or write ("wb"). - # - # The filename may be "-" to indicate stdin or stdout. - # - # @return - # Returns file handle on success; - # NULL on failure. - # - cram_fd *cram_open(const char *filename, const char *mode) - - # Opens an existing stream for reading or writing. - # - # @return - # Returns file handle on success; - # NULL on failure. - # - cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) - - # Closes a CRAM file. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_close(cram_fd *fd) - - # - # Seek within a CRAM file. - # - # Returns 0 on success - # -1 on failure - # - int cram_seek(cram_fd *fd, off_t offset, int whence) - - # - # Flushes a CRAM file. - # Useful for when writing to stdout without wishing to close the stream. - # - # Returns 0 on success - # -1 on failure - # - int cram_flush(cram_fd *fd) - - # Checks for end of file on a cram_fd stream. - # - # @return - # Returns 0 if not at end of file - # 1 if we hit an expected EOF (end of range or EOF block) - # 2 for other EOF (end of stream without EOF block) - # - int cram_eof(cram_fd *fd) - - # Sets options on the cram_fd. - # - # See CRAM_OPT_* definitions in hts.h. - # Use this immediately after opening. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) - - # Sets options on the cram_fd. - # - # See CRAM_OPT_* definitions in hts.h. - # Use this immediately after opening. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) - - # - # Attaches a header to a cram_fd. - # - # This should be used when creating a new cram_fd for writing where - # we have an SAM_hdr already constructed (eg from a file we've read - # in). - # - # @return - # Returns 0 on success; - # -1 on failure - # - int cram_set_header(cram_fd *fd, SAM_hdr *hdr) - - # Check if this file has a proper EOF block - # - # @return - # Returns 3 if the file is a version of CRAM that does not contain EOF blocks - # 2 if the file is a stream and thus unseekable - # 1 if the file contains an EOF block - # 0 if the file does not contain an EOF block - # -1 if an error occurred whilst reading the file or we could not seek back to where we were - # - # - int cram_check_EOF(cram_fd *fd) - - # As int32_decoded/encode, but from/to blocks instead of cram_fd */ - int int32_put_blk(cram_block *b, int32_t val) - - # Deallocates all storage used by a SAM_hdr struct. - # - # This also decrements the header reference count. If after decrementing - # it is still non-zero then the header is assumed to be in use by another - # caller and the free is not done. - # - # This is a synonym for sam_hdr_dec_ref(). - # - void sam_hdr_free(SAM_hdr *hdr) - - # Returns the current length of the SAM_hdr in text form. - # - # Call sam_hdr_rebuild() first if editing has taken place. - # - int sam_hdr_length(SAM_hdr *hdr) - - # Returns the string form of the SAM_hdr. - # - # Call sam_hdr_rebuild() first if editing has taken place. - # - char *sam_hdr_str(SAM_hdr *hdr) - - # Appends a formatted line to an existing SAM header. - # - # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - # optional new-line. If it contains more than 1 line then multiple lines - # will be added in order. - # - # Len is the length of the text data, or 0 if unknown (in which case - # it should be null terminated). - # - # @return - # Returns 0 on success; - # -1 on failure - # - - # Add an @PG line. - # - # If we wish complete control over this use sam_hdr_add() directly. This - # function uses that, but attempts to do a lot of tedious house work for - # you too. - # - # - It will generate a suitable ID if the supplied one clashes. - # - It will generate multiple @PG records if we have multiple PG chains. - # - # Call it as per sam_hdr_add() with a series of key,value pairs ending - # in NULL. - # - # @return - # Returns 0 on success; - # -1 on failure - # - int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) - - # - # A function to help with construction of CL tags in @PG records. - # Takes an argc, argv pair and returns a single space-separated string. - # This string should be deallocated by the calling function. - # - # @return - # Returns malloced char * on success; - # NULL on failure - # - char *stringify_argv(int argc, char *argv[]) - - # - # Returns the refs_t structure used by a cram file handle. - # - # This may be used in conjunction with option CRAM_OPT_SHARED_REF to - # share reference memory between multiple file handles. - # - # @return - # Returns NULL if none exists or the file handle is not a CRAM file. - # - refs_t *cram_get_refs(htsFile *fd) - - cdef class HTSFile(object): cdef htsFile *htsfile # pointer to htsFile structure cdef int64_t start_offset # BGZF offset of first record @@ -2714,6 +1525,6 @@ cdef class HTSFile(object): cdef readonly bint is_stream # Is htsfile a non-seekable stream cdef readonly bint is_remote # Is htsfile a remote stream - cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh + cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh cdef htsFile *_open_htsfile(self) except? NULL