00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: intfeaturemap.h 00005 // Description: Encapsulation of IntFeatureSpace with IndexMapBiDi 00006 // to provide a subspace mapping and fast feature lookup. 00007 // Created: Tue Oct 26 08:58:30 PDT 2010 00008 // 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H__ 00022 #define TESSERACT_CLASSIFY_INTFEATUREMAP_H__ 00023 00024 #include "intfeaturespace.h" 00025 #include "indexmapbidi.h" 00026 #include "intproto.h" 00027 00028 namespace tesseract { 00029 00030 class SampleIterator; 00031 00032 // Number of positive and negative offset maps. 00033 static const int kNumOffsetMaps = 2; 00034 00035 // Class to map a feature space defined by INT_FEATURE_STRUCT to a compact 00036 // down-sampled subspace of actually used features. 00037 // The IntFeatureMap copes with 2 stages of transformation: 00038 // The first step is down-sampling (re-quantization) and converting to a 00039 // single index value from the 3-D input: 00040 // INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and 00041 // the second is a feature-space compaction to map only the feature indices 00042 // that are actually used. This saves space in classifiers that are built 00043 // using the mapped feature space. 00044 // index (sparse) feature <-> map (compact) feature via IndexMapBiDi. 00045 // Although the transformations are reversible, the inverses are lossy and do 00046 // not return the exact input INT_FEATURE_STRUCT, due to the many->one nature 00047 // of both transformations. 00048 class IntFeatureMap { 00049 public: 00050 IntFeatureMap(); 00051 ~IntFeatureMap(); 00052 00053 // Accessors. 00054 int sparse_size() const { 00055 return feature_space_.Size(); 00056 } 00057 int compact_size() const { 00058 return compact_size_; 00059 } 00060 const IntFeatureSpace& feature_space() const { 00061 return feature_space_; 00062 } 00063 const IndexMapBiDi& feature_map() const { 00064 return feature_map_; 00065 } 00066 00067 // Pseudo-accessors. 00068 int IndexFeature(const INT_FEATURE_STRUCT& f) const; 00069 int MapFeature(const INT_FEATURE_STRUCT& f) const; 00070 int MapIndexFeature(int index_feature) const; 00071 INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const; 00072 INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const; 00073 void DeleteMapFeature(int map_feature); 00074 bool IsMapFeatureDeleted(int map_feature) const; 00075 00076 // Copies the given feature_space and uses it as the index feature map 00077 // from INT_FEATURE_STRUCT. 00078 void Init(const IntFeatureSpace& feature_space); 00079 00080 // Helper to return an offset index feature. In this context an offset 00081 // feature with a dir of +/-1 is a feature of a similar direction, 00082 // but shifted perpendicular to the direction of the feature. An offset 00083 // feature with a dir of +/-2 is feature at the same position, but rotated 00084 // by +/- one [compact] quantum. Returns the index of the generated offset 00085 // feature, or -1 if it doesn't exist. Dir should be in 00086 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction. 00087 // A dir of 0 is an identity transformation. 00088 // Both input and output are from the index(sparse) feature space, not 00089 // the mapped/compact feature space, but the offset feature is the minimum 00090 // distance moved from the input to guarantee that it maps to the next 00091 // available quantum in the mapped/compact space. 00092 int OffsetFeature(int index_feature, int dir) const; 00093 00094 // Computes the features used by the subset of samples defined by 00095 // the iterator and sets up the feature mapping. 00096 // Returns the size of the compacted feature space. 00097 int FindNZFeatureMapping(SampleIterator* it); 00098 00099 // After deleting some features, finish setting up the mapping, and map 00100 // all the samples. Returns the size of the compacted feature space. 00101 int FinalizeMapping(SampleIterator* it); 00102 00103 // Indexes the given array of features to a vector of sorted indices. 00104 void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features, 00105 int num_features, 00106 GenericVector<int>* sorted_features) const { 00107 feature_space_.IndexAndSortFeatures(features, num_features, 00108 sorted_features); 00109 } 00110 // Maps the given array of index/sparse features to an array of map/compact 00111 // features. 00112 // Assumes the input is sorted. The output indices are sorted and uniqued. 00113 // Returns the number of "missed" features, being features that 00114 // don't map to the compact feature space. 00115 int MapIndexedFeatures(const GenericVector<int>& index_features, 00116 GenericVector<int>* map_features) const { 00117 return feature_map_.MapFeatures(index_features, map_features); 00118 } 00119 00120 // Prints the map features from the set in human-readable form. 00121 void DebugMapFeatures(const GenericVector<int>& map_features) const; 00122 00123 private: 00124 void Clear(); 00125 00126 // Helper to compute an offset index feature. In this context an offset 00127 // feature with a dir of +/-1 is a feature of a similar direction, 00128 // but shifted perpendicular to the direction of the feature. An offset 00129 // feature with a dir of +/-2 is feature at the same position, but rotated 00130 // by +/- one [compact] quantum. Returns the index of the generated offset 00131 // feature, or -1 if it doesn't exist. Dir should be in 00132 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction. 00133 // A dir of 0 is an identity transformation. 00134 // Both input and output are from the index(sparse) feature space, not 00135 // the mapped/compact feature space, but the offset feature is the minimum 00136 // distance moved from the input to guarantee that it maps to the next 00137 // available quantum in the mapped/compact space. 00138 int ComputeOffsetFeature(int index_feature, int dir) const; 00139 00140 // True if the mapping has changed since it was last finalized. 00141 bool mapping_changed_; 00142 // Size of the compacted feature space, after unused features are removed. 00143 int compact_size_; 00144 // Feature space quantization definition and indexing from INT_FEATURE_STRUCT. 00145 IntFeatureSpace feature_space_; 00146 // Mapping from indexed feature space to the compacted space with unused 00147 // features mapping to -1. 00148 IndexMapBiDi feature_map_; 00149 // Index tables to map a feature index to the corresponding feature after a 00150 // shift perpendicular to the feature direction, or a rotation in place. 00151 // An entry of -1 indicates that there is no corresponding feature. 00152 // Array of arrays of size feature_space_.Size() owned by this class. 00153 int* offset_plus_[kNumOffsetMaps]; 00154 int* offset_minus_[kNumOffsetMaps]; 00155 00156 // Don't use default copy and assign! 00157 IntFeatureMap(const IntFeatureMap&); 00158 void operator=(const IntFeatureMap&); 00159 }; 00160 00161 } // namespace tesseract. 00162 00163 #endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H__
 1.6.3
 1.6.3