LDA++
NumpyFormat.hpp
#ifndef _LDAPLUSPLUS_NUMPY_FORMAT_HPP_
#define _LDAPLUSPLUS_NUMPY_FORMAT_HPP_


#include <cstdint>
#include <fstream>
#include <memory>
#include <sstream>    // std::ostringstream used by operator<<
#include <stdexcept>  // std::runtime_error, std::invalid_argument
#include <string>
#include <utility>    // std::swap
#include <vector>

#include <Eigen/Core>

namespace ldaplusplus {
namespace numpy_format {
    template <typename Scalar>
    inline const char * dtype_for_scalar() {
        return "object";
    }


    // These specializations only work if the machine that reads the file
    // has the same endianness as the one that wrote it
    template<>
    inline const char * dtype_for_scalar<int8_t>() { return "i1"; }
    template<>
    inline const char * dtype_for_scalar<int16_t>() { return "i2"; }
    template<>
    inline const char * dtype_for_scalar<int32_t>() { return "i4"; }
    template<>
    inline const char * dtype_for_scalar<int64_t>() { return "i8"; }
    template<>
    inline const char * dtype_for_scalar<uint8_t>() { return "u1"; }
    template<>
    inline const char * dtype_for_scalar<uint16_t>() { return "u2"; }
    template<>
    inline const char * dtype_for_scalar<uint32_t>() { return "u4"; }
    template<>
    inline const char * dtype_for_scalar<uint64_t>() { return "u8"; }
    template<>
    inline const char * dtype_for_scalar<float>() { return "f4"; }
    template<>
    inline const char * dtype_for_scalar<double>() { return "f8"; }


    inline bool is_big_endian() {
        union ByteOrder
        {
            int32_t i;
            uint8_t c[4];
        };
        ByteOrder b = {0x01234567};

        return b.c[0] == 0x01;
    }


    template <typename Scalar>
    void swap_endianess(Scalar * data, size_t N) {
        union D
        {
            Scalar v;
            uint8_t c[sizeof(Scalar)];
        };

        D d;
        for (size_t i=0; i<N; i++) {
            d.v = data[i];

            for (size_t j=0; j<sizeof(Scalar)/2; j++) {
                std::swap(d.c[j], d.c[sizeof(Scalar)-j-1]);
            }

            // write the swapped bytes back into the array; without this the
            // swap would only affect the local union
            data[i] = d.v;
        }
    }


    template <typename Scalar>
    class NumpyOutput
    {
        public:
            NumpyOutput(
                const Scalar *data,
                std::vector<size_t> shape,
                bool fortran
            ) : data_(data),
                shape_(shape),
                fortran_(fortran)
            {}

            template <int Rows, int Cols, int Options>
            NumpyOutput(const Eigen::Matrix<Scalar, Rows, Cols, Options> &matrix)
                : NumpyOutput(
                    matrix.data(),
                    std::vector<size_t>{
                        static_cast<size_t>(matrix.rows()),
                        static_cast<size_t>(matrix.cols())
                    },
                    ! Eigen::Matrix<Scalar, Rows, Cols, Options>::IsRowMajor
                )
            {}

            std::string dtype() const {
                return std::string(is_big_endian() ? ">" : "<") +
                       dtype_for_scalar<Scalar>();
            }

            const char * data() const { return reinterpret_cast<const char *>(data_); }

            const std::vector<size_t> & shape() const { return shape_; }

            bool fortran_contiguous() const { return fortran_; }

        private:
            friend std::ostream & operator<<(std::ostream &os, const NumpyOutput &data) {
                const uint8_t MAGIC_AND_VERSION[] = {
                    0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59, 0x01, 0x00
                };

                // We need to create the header for the data which is a python
                // literal string
                std::ostringstream header;
                header << "{'descr': '" << data.dtype() << "',"
                       << " 'fortran_order': " << (data.fortran_contiguous() ? "True" : "False") << ","
                       << " 'shape': (";
                for (auto d : data.shape()) {
                    header << d << ", ";
                }
                header << ")}";

                // Now we need to pad it with spaces until the total size of
                // the magic + version + header_len + header + newline is
                // divisible by 16 (the magic, version and header_len take 10
                // bytes and the trailing newline one more, hence the +11)
                while ((static_cast<size_t>(header.tellp()) + 11) % 16) {
                    header << " ";
                }
                header << "\n";

                // Now we need to write the magic and version
                os.write(reinterpret_cast<const char *>(MAGIC_AND_VERSION), 8);

                // Write the header length in little endian no matter what
                uint16_t header_len = header.tellp();
                if (!is_big_endian()) {
                    os.write(reinterpret_cast<const char *>(&header_len), 2);
                } else {
                    os.write(reinterpret_cast<const char *>(&header_len) + 1, 1);
                    os.write(reinterpret_cast<const char *>(&header_len), 1);
                }

                // Write the header
                os << header.str();

                // Finally write the data
                size_t N = 1;
                for (auto d : data.shape()) {
                    N *= d;
                }
                os.write(data.data(), N*sizeof(Scalar));

                return os;
            }

            const Scalar * data_;
            std::vector<size_t> shape_;
            bool fortran_;
    };


    template <typename Scalar>
    class NumpyInput
    {
        public:
            NumpyInput() {}

            bool fortran_contiguous() const { return fortran_; }
            const Scalar * data() const { return data_.data(); }
            const std::vector<size_t> & shape() const { return shape_; }

            template <int Rows, int Cols, int Options>
            operator Eigen::Matrix<Scalar, Rows, Cols, Options>() const {
                // get the needed rows, cols
                int rows = shape_[0];
                int cols = 1;
                for (size_t i=1; i<shape_.size(); i++) {
                    cols *= shape_[i];
                }

                Eigen::Matrix<Scalar, Rows, Cols, Options> matrix(
                    rows,
                    cols
                );

                // copy the data by hand for simplicity
                for (int i=0; i<cols; i++) {
                    for (int j=0; j<rows; j++) {
                        int idx = (fortran_) ? i*rows + j : j*cols + i;

                        matrix(j, i) = data_[idx];
                    }
                }

                // now return the object and let C++11 move semantics make it
                // a cost free return by value
                return matrix;
            }

        private:
            friend std::istream & operator>>(std::istream &is, NumpyInput &data) {
                // reset the NumpyInput instance
                data.data_.clear();
                data.shape_.clear();

                // read the magic and the version
                char MAGIC_AND_VERSION[8];
                is.read(MAGIC_AND_VERSION, 8);

                // if the file is empty (i.e. we read nothing) just throw a
                // runtime error; this check must come before inspecting the
                // magic bytes, which would otherwise be uninitialized
                if (is.gcount() == 0) {
                    throw std::runtime_error(
                        "The file is empty and cannot be read"
                    );
                }

                // assert that the version is compatible
                if (MAGIC_AND_VERSION[6] > 1) {
                    throw std::runtime_error(
                        "Only version 1 of the numpy format is supported"
                    );
                }

                // read the header length
                uint16_t header_len;
                is.read(reinterpret_cast<char *>(&header_len), 2);
                if (is_big_endian()) {
                    swap_endianess(&header_len, 1);
                }

                // read the header
                std::vector<char> buffer(header_len+1);
                is.read(&buffer[0], header_len);
                buffer[header_len] = 0;
                std::string header(&buffer[0]);

                // we can parse the header efficiently using the fact that the
                // specification requires the dictionary to be produced by
                // pprint.pformat()

                // parse dtype info
                std::string dtype = header.substr(11, 3);
                bool endianness = dtype[0] == '>';
                if (dtype.substr(1) != dtype_for_scalar<Scalar>()) {
                    throw std::runtime_error(
                        std::string() +
                        "The type of the array is not the " +
                        "one requested: " + dtype.substr(1) +
                        " != " + dtype_for_scalar<Scalar>()
                    );
                }

                // parse contiguity type
                data.fortran_ = header[34] == 'T';

                // parse shape
                std::string shape = header.substr(
                    header.find_last_of('(')+1,
                    header.find_last_of(')')
                );
                try {
                    while (true) {
                        size_t processed;
                        data.shape_.push_back(std::stoi(shape, &processed));

                        // +2 to account for the comma and the space
                        shape = shape.substr(processed + 2);
                    }
                } catch (const std::invalid_argument&) {
                    // that's ok it means we finished parsing the tuple
                }

                // compute the total size of the data
                int N = 1;
                for (auto c : data.shape_) {
                    N *= c;
                }

                // read the data
                data.data_.resize(N);
                is.read(reinterpret_cast<char *>(&data.data_[0]), N*sizeof(Scalar));

                // fix the endianness
                if (endianness != is_big_endian()) {
                    swap_endianess(&data.data_[0], N);
                }

                return is;
            }

            std::vector<Scalar> data_;
            std::vector<size_t> shape_;
            bool fortran_;
    };


    template <typename Scalar, int Rows, int Cols, int Options>
    void save(std::ostream &out_stream, const Eigen::Matrix<Scalar, Rows, Cols, Options> &matrix) {
        out_stream << NumpyOutput<Scalar>(matrix);
    }


    template <typename Scalar, int Rows, int Cols, int Options>
    void save(std::string save_path, const Eigen::Matrix<Scalar, Rows, Cols, Options> &matrix) {
        std::fstream out_stream(
            save_path,
            std::ios::out | std::ios::binary
        );

        save(out_stream, matrix);
    }


    template <
        typename Scalar,
        int Rows=Eigen::Dynamic,
        int Cols=Eigen::Dynamic,
        int Options=Eigen::ColMajor | Eigen::AutoAlign
    >
    Eigen::Matrix<Scalar, Rows, Cols, Options> load(std::istream &in_stream) {
        NumpyInput<Scalar> ni;
        in_stream >> ni;

        return ni;
    }


    template <
        typename Scalar,
        int Rows=Eigen::Dynamic,
        int Cols=Eigen::Dynamic,
        int Options=Eigen::ColMajor | Eigen::AutoAlign
    >
    Eigen::Matrix<Scalar, Rows, Cols, Options> load(std::string data_path) {
        std::fstream in_stream(
            data_path,
            std::ios::in | std::ios::binary
        );

        return load<Scalar>(in_stream);
    }

} // namespace numpy_format

} // namespace ldaplusplus
#endif // _LDAPLUSPLUS_NUMPY_FORMAT_HPP_
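
A minimal usage sketch of the save() and load() helpers defined above, not part of the header itself. The include directive and the file name "weights.npy" are illustrative assumptions; adjust the include path to wherever the header lives in your project.

// usage_sketch.cpp -- assumes Eigen and this header are on the include path
#include <iostream>

#include <Eigen/Core>

#include "NumpyFormat.hpp"  // assumed path; e.g. <ldaplusplus/NumpyFormat.hpp> in an installed tree

int main() {
    using ldaplusplus::numpy_format::save;
    using ldaplusplus::numpy_format::load;

    // write a double matrix as an .npy file that numpy.load() can read
    Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 3);
    save("weights.npy", A);

    // read it back; the requested Scalar must match the stored dtype ("f8")
    Eigen::MatrixXd B = load<double>("weights.npy");

    std::cout << "max abs difference: "
              << (A - B).cwiseAbs().maxCoeff() << std::endl;

    return 0;
}

Since Eigen matrices are column-major by default, the writer marks the file as fortran_order: True and the reader honours that flag when copying the values back, so the round trip preserves the element layout.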