source: branches/testFACT++branch/src/zfits.cc@ 19366

Last change on this file since 19366 was 14822, checked in by tbretz, 12 years ago
In the example, alignment on two samples is enough
File size: 12.6 KB
Line 
1#include <fstream>
2#include <iostream>
3#include <algorithm>
4#include <map>
5#include <vector>
6
7#include "Configuration.h"
8
9#include "externals/fits.h"
10#include "externals/huffman.h"
11
12#include "Time.h"
13
14using namespace std;
15
16void SetupConfiguration(Configuration &conf)
17{
18 po::options_description control("zfits");
19 control.add_options()
20 ("in", var<string>()->required(), "")
21 ("out", var<string>(), "")
22 ("decompress,d", po_switch(), "")
23 ("force,f", var<string>(), "Force overwrite of output file")
24 ;
25
26 po::positional_options_description p;
27 p.add("in", 1); // The 1st positional options
28 p.add("out", 2); // The 2nd positional options
29
30 conf.AddOptions(control);
31 conf.SetArgumentPositions(p);
32}
33
34void PrintUsage()
35{
36 cout <<
37 "zfits - A fits compressor\n"
38 "\n"
39 "\n"
40 "Usage: zfits [-d] input.fits[.gz] [output.zf]\n";
41 cout << endl;
42}
43
44
45string ReplaceEnd(const string &str, const string &expr, const string &repl)
46{
47 string out(str);
48
49 const size_t p = out.rfind(expr);
50 if (p==out.size()-expr.length())
51 out.replace(p, expr.length(), repl);
52
53 return out;
54}
55
56
57string ReplaceExt(const string &name, bool decomp)
58{
59 if (decomp)
60 return ReplaceEnd(name, ".zfits", ".fits");
61
62 string out = ReplaceEnd(name, ".fits", ".zfits");
63 return ReplaceEnd(out, ".fits.gz", ".zfits");
64}
65
66int Compress(const string &ifile, const string &ofile)
67{
68 // when to print some info on the screen (every f percent)
69 float frac = 0.01;
70
71 // open a fits file
72 fits f(ifile);
73
74 // open output file
75 ofstream fout(ofile);
76
77 // counters for total size and compressed size
78 uint64_t tot = 0;
79 uint64_t com = 0;
80
81 // very simple timer
82 double sec = 0;
83
84 // Produce a lookup table with all informations about the
85 // columns in the same order as they are in the file
86 const fits::Table::Columns &cols= f.GetColumns();
87
88 struct col_t : fits::Table::Column
89 {
90 string name;
91 void *ptr;
92 };
93
94
95 map<size_t, col_t> columns;
96
97 size_t row_tot = 0;
98 for (auto it=cols.begin(); it!=cols.end(); it++)
99 {
100 col_t c;
101
102 c.offset = it->second.offset;
103 c.size = it->second.size;
104 c.num = it->second.num;
105 c.name = it->first;
106 c.ptr = f.SetPtrAddress(it->first);
107
108 columns[c.offset] = c;
109
110 row_tot += c.size*c.num;
111 }
112
113 // copy the header from the input to the output file
114 // and prefix the output file as a compressed fits file
115 string header;
116 header.resize(f.tellg());
117
118 f.seekg(0);
119 f.read((char*)header.c_str(), header.size());
120
121 char m[2];
122 m[0] = 'z'+128;
123 m[1] = 'f'+128;
124
125 const size_t hlen = 0;
126
127 size_t hs = header.size();
128
129 fout.write(m, 2); // magic number
130 fout.write((char*)&hlen, sizeof(size_t)); // length of possible header data (e.g. file version, compression algorithm)
131 fout.write((char*)&hs, sizeof(size_t)); // size of FITS header
132 fout.write(header.c_str(), header.size()); // uncompressed FITS header
133
134 tot += header.size();
135 com += header.size()+2+2*sizeof(size_t);
136
137 cout << fixed;
138
139 Time start;
140
141 // loop over all rows
142 vector<char> cache(row_tot);
143 while (f.GetNextRow())
144 {
145 // pointer to the start of the cache for the data of one row
146 char *out = cache.data();
147
148 // mask stroing which column have been compressed and which not
149 vector<uint8_t> mask(cols.size()/8 + 1);
150
151 // loop over all columns
152 uint32_t icol = 0;
153 for (auto it=columns.begin(); it!=columns.end(); it++, icol++)
154 {
155 // size of cell in bytes
156 const size_t len_col = it->second.size * it->second.num;
157
158 // get pointer to data
159 int16_t *ptr = (int16_t*)it->second.ptr;
160
161 // If the column is the data, preprocess the data
162 /*
163 if (it->second.name=="Data")
164 {
165 int16_t *end = ptr+1440*300-4-(1440*300)%2;
166 int16_t *beg = ptr;
167
168 while (end>=beg)
169 {
170 const int16_t avg = (end[0] + end[1])/2;
171 end[2] -= avg;
172 end[3] -= avg;
173 end -=2;
174 }
175 }*/
176
177 // do not try to compress less than 32bytes
178 if (len_col>32 && it->second.size==2)
179 {
180 Time now;
181
182 // perform 16bit hoffman (option for 8bit missing, skip 64bit)
183 // (what to do with floats?)
184 string buf;
185 /*int len =*/ Huffman::Encode(buf, (uint16_t*)ptr, len_col/2);
186
187 sec += Time().UnixTime()-now.UnixTime();
188
189 // check if data was really compressed
190 if (buf.size()<len_col)
191 {
192 // copy compressed data into output cache
193 memcpy(out, buf.c_str(), buf.size());
194 out += buf.size();
195
196 // update mask
197 const uint64_t bit = (icol%8);
198 mask[icol/8] |= (1<<bit);
199
200 continue;
201 }
202 }
203
204 // just copy the data if it has not been compressed
205 memcpy(out, (char*)ptr, len_col);
206 out += len_col;
207 }
208
209 // calcualte size of output buffer
210 const size_t sz = out-cache.data();
211
212 // update counters
213 tot += row_tot;
214 com += sz + mask.size();
215
216 // write the compression mask and the (partly) copmpressed data stream
217 fout.write((char*)mask.data(), mask.size());
218 fout.write(cache.data(), sz);
219
220 //if (sz2<0 || memcmp(data, dest3.data(), 432000*2)!=0)
221 // cout << "grrrr" << endl;
222
223 const float proc = float(f.GetRow())/f.GetNumRows();
224 if (proc>frac)
225 {
226 const double elep = Time().UnixTime()-start.UnixTime();
227 cout << "\r" << setprecision(0) << setw(3) << 100*proc << "% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s in:" << tot/1000000/elep << "MB/s" << flush;
228 frac += 0.01;
229 }
230 }
231
232 const double elep = Time().UnixTime()-start.UnixTime();
233 cout << setprecision(0) << "\r100% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s in:" << tot/1000000/elep << "MB/s" << endl;
234
235 return 0;
236}
237
238template<size_t N>
239void revcpy(char *dest, const char *src, int num)
240{
241 const char *pend = src + num*N;
242 for (const char *ptr = src; ptr<pend; ptr+=N, dest+=N)
243 reverse_copy(ptr, ptr+N, dest);
244}
245
246int Decompress(const string &ifile, const string &ofile)
247{
248 // open a fits file
249 ifstream fin(ifile);
250
251 // open output file
252 ofstream fout(ofile);
253
254 // get and check magic number
255 unsigned char m[2];
256 fin.read((char*)m, 2);
257 if (m[0]!='z'+128 || m[1]!='f'+128)
258 throw runtime_error("File not a compressed fits file.");
259
260 // get length of additional header information
261 size_t hlen = 0;
262 fin.read((char*)&hlen, sizeof(size_t));
263 if (hlen>0)
264 throw runtime_error("Only Version-zero files supported.");
265
266 // get size of FITS header
267 size_t hs = 0;
268 fin.read((char*)&hs, sizeof(size_t));
269 if (!fin)
270 throw runtime_error("Could not access header size.");
271
272 // copy the header from the input to the output file
273 // and prefix the output file as a compressed fits file
274 string header;
275 header.resize(hs);
276
277 fin.read((char*)header.c_str(), header.size());
278 fout.write((char*)header.c_str(), header.size());
279 if (!fin)
280 throw runtime_error("Could not read full header");
281
282 string templ("tmpXXXXXX");
283 int fd = mkstemp((char*)templ.c_str());
284 const ssize_t rc = write(fd, header.c_str(), header.size());
285 close(fd);
286
287 if (rc<0)
288 throw runtime_error("Could not write to temporary file: "+string(strerror(errno)));
289
290 // open the output file to get the header parsed
291 fits info(templ);
292
293 remove(templ.c_str());
294
295 // get the maximum size of one row and a list
296 // of all columns ordered by their offset
297 size_t row_tot = 0;
298 const fits::Table::Columns &cols = info.GetColumns();
299 map<size_t, fits::Table::Column> columns;
300 for (auto it=cols.begin(); it!=cols.end(); it++)
301 {
302 columns[it->second.offset] = it->second;
303 row_tot += it->second.num*it->second.size;
304 }
305
306 // very simple timer
307 double sec = 0;
308 double frac = 0;
309
310 size_t com = 2+hs+2*sizeof(size_t);
311 size_t tot = hs;
312
313 const size_t masklen = cols.size()/8+1;
314
315 // loop over all rows
316 vector<char> buf(row_tot+masklen);
317 vector<char> swap(row_tot);
318 uint32_t offset = 0;
319
320 const uint64_t nrows = info.GetUInt("NAXIS2");
321
322 Time start;
323
324 cout << fixed;
325 for (uint32_t irow=0; irow<nrows; irow++)
326 {
327 fin.read(buf.data()+offset, buf.size()-offset);
328
329 const uint8_t *mask = reinterpret_cast<uint8_t*>(buf.data());
330 offset = masklen;
331
332 char *ptr = swap.data();
333
334 uint32_t icol = 0;
335 for (auto it=columns.begin(); it!= columns.end(); it++, icol++)
336 {
337 const size_t &num = it->second.num;
338 const size_t &size = it->second.size;
339
340 if (mask[icol/8]&(1<<(icol%8)))
341 {
342 Time now;
343
344 vector<uint16_t> out(num*size/2);
345 int len = Huffman::Decode((uint8_t*)buf.data()+offset, buf.size()-offset, out);
346 if (len<0)
347 throw runtime_error("Decoding failed.");
348
349 sec += Time().UnixTime()-now.UnixTime();
350
351 offset += len;
352
353 revcpy<2>(ptr, (char*)out.data(), num);
354 }
355 else
356 {
357 switch (size)
358 {
359 case 1: memcpy (ptr, buf.data()+offset, num*size); break;
360 case 2: revcpy<2>(ptr, buf.data()+offset, num); break;
361 case 4: revcpy<4>(ptr, buf.data()+offset, num); break;
362 case 8: revcpy<8>(ptr, buf.data()+offset, num); break;
363 }
364
365 offset += num*size;
366 }
367
368 ptr += num*size;
369 }
370
371 com += offset+masklen;
372 tot += row_tot;
373
374 fout.write((char*)swap.data(), swap.size());
375
376 memmove(buf.data(), buf.data()+offset, buf.size()-offset);
377 offset = buf.size()-offset;
378
379 if (!fout)
380 throw runtime_error("Error writing to output file");
381
382 const float proc = float(irow)/nrows;
383 if (proc>frac)
384 {
385 const double elep = Time().UnixTime()-start.UnixTime();
386 cout << "\r" << setprecision(0) << setw(3) << 100*proc << "% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s out:" << tot/1000000/elep << "MB/s" << flush;
387 frac += 0.01;
388 }
389 }
390
391 const double elep = Time().UnixTime()-start.UnixTime();
392 cout << setprecision(0) << "\r100% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s out:" << tot/1000000/elep << "MB/s" << endl;
393
394 return 0;
395}
396
397int main(int argc, const char **argv)
398{
399 Configuration conf(argv[0]);
400 conf.SetPrintUsage(PrintUsage);
401 SetupConfiguration(conf);
402
403 if (!conf.DoParse(argc, argv))
404 return 127;
405
406 const bool decomp = conf.Get<bool>("decompress");
407
408 const string ifile = conf.Get<string>("in");
409 const string ofile = conf.Has("out") ? conf.Get<string>("out") : ReplaceExt(ifile, decomp);
410
411 return decomp ? Decompress(ifile, ofile) : Compress(ifile, ofile);
412
413 /*
414 // reading and writing files which just contain the binary data
415 // For simplicity I assume ROI=300
416
417 ifstream finx( "20130117_082.fits.pu");
418 ofstream foutx("20130117_082.fits.puz");
419
420 while (1)
421 {
422 string str;
423 str.resize(432000*2);
424
425 finx.read((char*)str.c_str(), 432000*2);
426 if (!finx)
427 break;
428
429 // Preprocess the data, e.g. subtract median pixelwise
430 for (int i=0; i<1440; i++)
431 {
432 int16_t *chunk = (int16_t*)str.c_str()+i*300;
433 sort(chunk, chunk+300);
434
435 int16_t med = chunk[149];
436
437 for (int j=0; j<300; j++)
438 chunk[j] -= med;
439 }
440
441 // do huffman encoding on shorts
442 string buf;
443 int len = huffmans_encode(buf, (uint16_t*)str.c_str(), 432000);
444
445 // if the result is smaller than the original data write
446 // the result, otherwise the original data
447 if (buf.size()<432000*2)
448 foutx.write(buf.c_str(), buf.size());
449 else
450 foutx.write(str.c_str(), 432000);
451 }
452
453 return 0;
454 */
455}
Note: See TracBrowser for help on using the repository browser.