BamTools  2.5.2
BamAlignment.h
Go to the documentation of this file.
1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
9 
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
12 
13 #include <cstddef>
14 #include <cstdlib>
15 #include <cstring>
16 #include <string>
17 #include <vector>
18 #include "api/BamAux.h"
19 #include "api/BamConstants.h"
20 #include "api/api_global.h"
21 
22 namespace BamTools {
23 
// BamAlignment names these reader/writer implementation classes as friends,
// so they only have to be declared (not defined) at this point.
namespace Internal {

class BamReaderPrivate;
class BamWriterPrivate;

}  // namespace Internal
31 
32 // BamAlignment data structure
33 class API_EXPORT BamAlignment
34 {
35 
36  // constructors & destructor
37 public:
38  BamAlignment();
39 
40  // queries against alignment flags
41 public:
42  bool IsDuplicate() const; // returns true if this read is a PCR duplicate
43  bool IsFailedQC() const; // returns true if this read failed quality control
44  bool IsFirstMate() const; // returns true if alignment is first mate on read
45  bool IsMapped() const; // returns true if alignment is mapped
46  bool IsMateMapped() const; // returns true if alignment's mate is mapped
47  bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand
48  bool IsPaired() const; // returns true if alignment part of paired-end read
49  bool IsSupplementary() const; // returns true if this read is supplementary
50  bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment
51  bool IsProperPair()
52  const; // returns true if alignment is part of read that satisfied paired-end resolution
53  bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand
54  bool IsSecondMate() const; // returns true if alignment is second mate on read
55 
56  // manipulate alignment flags
57 public:
58  void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
59  void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
60  void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
61  void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
62  void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
63  void SetIsMateReverseStrand(
64  bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
65  void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
66  void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
67  void SetIsProperPair(
68  bool
69  ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
70  void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
71  void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
72 
73  // tag data access methods
74 public:
75  // add a new tag
76  template <typename T>
77  bool AddTag(const std::string& tag, const std::string& type, const T& value);
78  template <typename T>
79  bool AddTag(const std::string& tag, const std::vector<T>& values);
80 
81  // edit (or append) tag
82  template <typename T>
83  bool EditTag(const std::string& tag, const std::string& type, const T& value);
84  template <typename T>
85  bool EditTag(const std::string& tag, const std::vector<T>& values);
86 
87  // retrieves tag data
88  template <typename T>
89  bool GetTag(const std::string& tag, T& destination) const;
90  template <typename T>
91  bool GetTag(const std::string& tag, std::vector<T>& destination) const;
92 
93  // retrieves all current tag names
94  std::vector<std::string> GetTagNames() const;
95 
96  // retrieves the SAM/BAM type-code for requested tag name
97  bool GetTagType(const std::string& tag, char& type) const;
98 
99  // retrieves the SAM/BAM type-code for the data elements in an array tag
100  bool GetArrayTagType(const std::string& tag, char& type) const;
101 
102  // returns true if alignment has a record for this tag name
103  bool HasTag(const std::string& tag) const;
104 
105  // removes a tag
106  void RemoveTag(const std::string& tag);
107 
108  // additional methods
109 public:
110  // populates alignment string fields
111  bool BuildCharData();
112 
113  // calculates alignment end position
114  int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
115 
116  // returns a description of the last error that occurred
117  std::string GetErrorString() const;
118 
119  // retrieves the size, read locations and reference locations of soft-clip operations
120  bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
121  std::vector<int>& genomePositions, bool usePadded = false) const;
122 
123  // public data fields
124 public:
125  std::string Name; // read name
126  int32_t Length; // length of query sequence
127  std::string QueryBases; // 'original' sequence (contained in BAM file)
128  std::string
129  AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
130  std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
131  std::string TagData; // tag data (use provided methods to query/modify)
132  int32_t RefID; // ID number for reference sequence
133  int32_t Position; // position (0-based) where alignment starts
134  uint16_t Bin; // BAM (standard) index bin number for this alignment
135  uint16_t MapQuality; // mapping quality score
136  uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
137  std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
138  int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
139  int32_t MatePosition; // position (0-based) where alignment's mate starts
140  int32_t InsertSize; // mate-pair insert size
141  std::string Filename; // name of BAM file which this alignment comes from
142 
144  // internal utility methods
145 private:
146  bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength,
147  unsigned int& numBytesParsed) const;
148  bool IsValidSize(const std::string& tag, const std::string& type) const;
149  void SetErrorString(const std::string& where, const std::string& what) const;
150  bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const;
151 
153  // internal data
154 private:
155  struct BamAlignmentSupportData
156  {
157 
159  // data members
160  std::string AllCharData;
161  uint32_t BlockLength;
162  uint32_t NumCigarOperations;
163  uint32_t QueryNameLength;
164  uint32_t QuerySequenceLength;
165  bool HasCoreOnly;
166 
168  // constructor
169  BamAlignmentSupportData()
170  : BlockLength(0)
171  , NumCigarOperations(0)
172  , QueryNameLength(0)
173  , QuerySequenceLength(0)
174  , HasCoreOnly(false)
175  {}
176  };
177  BamAlignmentSupportData SupportData;
178  friend class Internal::BamReaderPrivate;
179  friend class Internal::BamWriterPrivate;
180 
181  mutable std::string ErrorString; // mutable to allow updates even in logically const methods
182 };
183 
184 // ---------------------------------------------------------
185 // BamAlignment tag access methods
186 
198 template <typename T>
199 bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value)
200 {
201 
202  // if char data not populated, do that first
203  if (SupportData.HasCoreOnly) {
204  BuildCharData();
205  }
206 
207  // check tag/type size
208  if (!IsValidSize(tag, type)) {
209  // TODO: set error string?
210  return false;
211  }
212 
213  // check that storage type code is OK for T
214  if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) {
215  // TODO: set error string?
216  return false;
217  }
218 
219  // localize the tag data
220  char* pTagData = (char*)TagData.data();
221  const unsigned int tagDataLength = TagData.size();
222  unsigned int numBytesParsed = 0;
223 
224  // if tag already exists, return false
225  // use EditTag explicitly instead
226  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
227  // TODO: set error string?
228  return false;
229  }
230 
231  // otherwise, convert value to string
232  union
233  {
234  T value;
235  char valueBuffer[sizeof(T)];
236  } un;
237  un.value = value;
238 
239  // copy original tag data to temp buffer
240  const std::string newTag = tag + type;
241  const std::size_t newTagDataLength =
242  tagDataLength + newTag.size() + sizeof(T); // leave room for new T
243  RaiiBuffer originalTagData(newTagDataLength);
244  std::memcpy(originalTagData.Buffer, TagData.c_str(),
245  tagDataLength + 1); // '+1' for TagData null-term
246 
247  // append newTag
248  std::strcat(originalTagData.Buffer + tagDataLength, newTag.data());
249  std::memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
250 
251  // store temp buffer back in TagData
252  const char* newTagData = (const char*)originalTagData.Buffer;
253  TagData.assign(newTagData, newTagDataLength);
254  return true;
255 }
256 
257 template <>
258 inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type,
259  const std::string& value)
260 {
261  // if char data not populated, do that first
262  if (SupportData.HasCoreOnly) {
263  BuildCharData();
264  }
265 
266  // check tag/type size
267  if (!IsValidSize(tag, type)) {
268  // TODO: set error string?
269  return false;
270  }
271 
272  // check that storage type code is OK for string
273  if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) {
274  // TODO: set error string?
275  return false;
276  }
277 
278  // localize the tag data
279  char* pTagData = (char*)TagData.data();
280  const unsigned int tagDataLength = TagData.size();
281  unsigned int numBytesParsed = 0;
282 
283  // if tag already exists, return false
284  // use EditTag explicitly instead
285  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
286  // TODO: set error string?
287  return false;
288  }
289 
290  // otherwise, copy tag data to temp buffer
291  const std::string newTag = tag + type + value;
292  const std::size_t newTagDataLength =
293  tagDataLength + newTag.size() + 1; // leave room for null-term
294  RaiiBuffer originalTagData(newTagDataLength);
295  std::memcpy(originalTagData.Buffer, TagData.c_str(),
296  tagDataLength + 1); // '+1' for TagData null-term
297 
298  // append newTag (removes original null-term, then appends newTag + null-term)
299  std::strcat(originalTagData.Buffer + tagDataLength, newTag.data());
300 
301  // store temp buffer back in TagData
302  const char* newTagData = (const char*)originalTagData.Buffer;
303  TagData.assign(newTagData, newTagDataLength);
304  return true;
305 }
306 
317 template <typename T>
318 bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values)
319 {
320 
321  // if char data not populated, do that first
322  if (SupportData.HasCoreOnly) {
323  BuildCharData();
324  }
325 
326  // check for valid tag name length
327  if (tag.size() != Constants::BAM_TAG_TAGSIZE) {
328  return false;
329  }
330 
331  // localize the tag data
332  char* pTagData = (char*)TagData.data();
333  const unsigned int tagDataLength = TagData.size();
334  unsigned int numBytesParsed = 0;
335 
336  // if tag already exists, return false
337  // use EditTag explicitly instead
338  if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
339  // TODO: set error string?
340  return false;
341  }
342 
343  // build new tag's base information
344  char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
345  std::memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE);
346  newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
347  newTagBase[3] = TagTypeHelper<T>::TypeCode();
348 
349  // add number of array elements to newTagBase
350  const int32_t numElements = values.size();
351  std::memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
352 
353  // copy current TagData string to temp buffer, leaving room for new tag's contents
354  const std::size_t newTagDataLength =
355  tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + numElements * sizeof(T);
356  RaiiBuffer originalTagData(newTagDataLength);
357  std::memcpy(originalTagData.Buffer, TagData.c_str(),
358  tagDataLength + 1); // '+1' for TagData's null-term
359 
360  // write newTagBase (removes old null term)
361  std::strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
362 
363  // add vector elements to tag
364  int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
365  for (int i = 0; i < numElements; ++i) {
366  const T& value = values.at(i);
367  std::memcpy(originalTagData.Buffer + elementsBeginOffset + i * sizeof(T), &value,
368  sizeof(T));
369  }
370 
371  // store temp buffer back in TagData
372  const char* newTagData = (const char*)originalTagData.Buffer;
373  TagData.assign(newTagData, newTagDataLength);
374  return true;
375 }
376 
391 template <typename T>
392 bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value)
393 {
394 
395  // if char data not populated, do that first
396  if (SupportData.HasCoreOnly) {
397  BuildCharData();
398  }
399 
400  // remove existing tag if present, then append tag with new value
401  if (HasTag(tag)) {
402  RemoveTag(tag);
403  }
404  return AddTag(tag, type, value);
405 }
406 
418 template <typename T>
419 bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values)
420 {
421 
422  // if char data not populated, do that first
423  if (SupportData.HasCoreOnly) {
424  BuildCharData();
425  }
426 
427  // remove existing tag if present, then append tag with new values
428  if (HasTag(tag)) {
429  RemoveTag(tag);
430  }
431  return AddTag(tag, values);
432 }
433 
441 template <typename T>
442 bool BamAlignment::GetTag(const std::string& tag, T& destination) const
443 {
444 
445  // skip if alignment is core-only
446  if (SupportData.HasCoreOnly) {
447  // TODO: set error string?
448  return false;
449  }
450 
451  // skip if no tags present
452  if (TagData.empty()) {
453  // TODO: set error string?
454  return false;
455  }
456 
457  // localize the tag data
458  char* pTagData = (char*)TagData.data();
459  const unsigned int tagDataLength = TagData.size();
460  unsigned int numBytesParsed = 0;
461 
462  // return failure if tag not found
463  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
464  // TODO: set error string?
465  return false;
466  }
467 
468  // fetch data type
469  const char type = *(pTagData - 1);
470  if (!TagTypeHelper<T>::CanConvertFrom(type)) {
471  // TODO: set error string ?
472  return false;
473  }
474 
475  // determine data length
476  int destinationLength = 0;
477  switch (type) {
478 
479  // 1 byte data
483  destinationLength = 1;
484  break;
485 
486  // 2 byte data
489  destinationLength = 2;
490  break;
491 
492  // 4 byte data
496  destinationLength = 4;
497  break;
498 
499  // var-length types not supported for numeric destination
503  SetErrorString("BamAlignment::GetTag",
504  "cannot store variable length tag data into a numeric destination");
505  return false;
506 
507  // unrecognized tag type
508  default:
509  const std::string message = std::string("invalid tag type: ") + type;
510  SetErrorString("BamAlignment::GetTag", message);
511  return false;
512  }
513 
514  // store data in destination
515  destination = 0;
516  std::memcpy(&destination, pTagData, destinationLength);
517 
518  // return success
519  return true;
520 }
521 
522 template <>
523 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
524  std::string& destination) const
525 {
526  // skip if alignment is core-only
527  if (SupportData.HasCoreOnly) {
528  // TODO: set error string?
529  return false;
530  }
531 
532  // skip if no tags present
533  if (TagData.empty()) {
534  // TODO: set error string?
535  return false;
536  }
537 
538  // localize the tag data
539  char* pTagData = (char*)TagData.data();
540  const unsigned int tagDataLength = TagData.size();
541  unsigned int numBytesParsed = 0;
542 
543  // return failure if tag not found
544  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
545  // TODO: set error string?
546  return false;
547  }
548 
549  // otherwise copy data into destination
550  const unsigned int dataLength = std::strlen(pTagData);
551  destination.clear();
552  destination.resize(dataLength);
553  std::memcpy((char*)destination.data(), pTagData, dataLength);
554 
555  // return success
556  return true;
557 }
558 
566 template <typename T>
567 bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const
568 {
569 
570  // skip if alignment is core-only
571  if (SupportData.HasCoreOnly) {
572  // TODO: set error string?
573  return false;
574  }
575 
576  // skip if no tags present
577  if (TagData.empty()) {
578  // TODO: set error string?
579  return false;
580  }
581 
582  // localize the tag data
583  char* pTagData = (char*)TagData.data();
584  const unsigned int tagDataLength = TagData.size();
585  unsigned int numBytesParsed = 0;
586 
587  // return false if tag not found
588  if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
589  // TODO: set error string?
590  return false;
591  }
592 
593  // check that tag is array type
594  const char tagType = *(pTagData - 1);
595  if (tagType != Constants::BAM_TAG_TYPE_ARRAY) {
596  SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
597  return false;
598  }
599 
600  // fetch element type
601  const char elementType = *pTagData;
602  if (!TagTypeHelper<T>::CanConvertFrom(elementType)) {
603  // TODO: set error string ?
604  return false;
605  }
606  ++pTagData;
607 
608  // calculate length of each element in tag's array
609  switch (elementType) {
613  break;
614 
617  break;
618 
622  break;
623 
624  // var-length types not supported for numeric destination
628  SetErrorString("BamAlignment::GetTag",
629  "invalid array data, variable-length elements are not allowed");
630  return false;
631 
632  // unknown tag type
633  default:
634  const std::string message = std::string("invalid array element type: ") + elementType;
635  SetErrorString("BamAlignment::GetTag", message);
636  return false;
637  }
638 
639  // get number of elements
640  int32_t numElements;
641  std::memcpy(&numElements, pTagData, sizeof(int32_t));
642  pTagData += 4;
643  destination.clear();
644  destination.reserve(numElements);
645 
646  // read in elements
647  T value;
648  for (int i = 0; i < numElements; ++i) {
649  std::memcpy(&value, pTagData, sizeof(T));
650  pTagData += sizeof(T);
651  destination.push_back(value);
652  }
653 
654  // return success
655  return true;
656 }
657 
658 typedef std::vector<BamAlignment> BamAlignmentVector;
659 
660 } // namespace BamTools
661 
662 #endif // BAMALIGNMENT_H
The main BAM alignment data structure.
Definition: BamAlignment.h:34
bool BuildCharData()
Populates alignment string fields (read name, bases, qualities, tag data).
Definition: BamAlignment.cpp:108
int32_t InsertSize
mate-pair insert size
Definition: BamAlignment.h:140
bool GetTag(const std::string &tag, T &destination) const
Definition: BamAlignment.h:442
int32_t Length
length of query sequence
Definition: BamAlignment.h:126
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition: BamAlignment.h:136
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:199
std::string AlignedBases
'aligned' sequence (includes any indels, padding, clipping)
Definition: BamAlignment.h:129
int32_t RefID
ID number for reference sequence.
Definition: BamAlignment.h:132
std::string Name
read name
Definition: BamAlignment.h:125
uint16_t MapQuality
mapping quality score
Definition: BamAlignment.h:135
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition: BamAlignment.h:130
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition: BamAlignment.h:134
int32_t MatePosition
position (0-based) where alignment's mate starts
Definition: BamAlignment.h:139
std::string TagData
tag data (use the provided methods to query/modify)
Definition: BamAlignment.h:131
std::string Filename
name of BAM file which this alignment comes from
Definition: BamAlignment.h:141
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition: BamAlignment.h:137
int32_t MateRefID
ID number for reference sequence where alignment's mate was aligned.
Definition: BamAlignment.h:138
bool HasTag(const std::string &tag) const
Returns true if alignment has a record for requested tag.
Definition: BamAlignment.cpp:723
int32_t Position
position (0-based) where alignment starts
Definition: BamAlignment.h:133
void RemoveTag(const std::string &tag)
Removes field from BAM tags.
Definition: BamAlignment.cpp:856
std::string QueryBases
'original' sequence (as reported from sequencing machine)
Definition: BamAlignment.h:127
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:392
const char BAM_TAG_TYPE_UINT8
Definition: BamConstants.h:76
const char BAM_TAG_TYPE_HEX
Definition: BamConstants.h:83
const char BAM_TAG_TYPE_INT32
Definition: BamConstants.h:79
const char BAM_TAG_TYPE_ASCII
Definition: BamConstants.h:74
const uint8_t BAM_TAG_TAGSIZE
Definition: BamConstants.h:86
const char BAM_TAG_TYPE_ARRAY
Definition: BamConstants.h:84
const char BAM_TAG_TYPE_FLOAT
Definition: BamConstants.h:81
const char BAM_TAG_TYPE_UINT32
Definition: BamConstants.h:80
const char BAM_TAG_TYPE_STRING
Definition: BamConstants.h:82
const char BAM_TAG_TYPE_INT8
Definition: BamConstants.h:75
const char BAM_TAG_TYPE_UINT16
Definition: BamConstants.h:78
const char BAM_TAG_TYPE_INT16
Definition: BamConstants.h:77
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition: BamConstants.h:88
Contains all BamTools classes & methods.
Definition: Sort.h:24
std::vector< BamAlignment > BamAlignmentVector
Definition: BamAlignment.h:658