JODA  0.13.1 (59b41972)
JSON On-Demand Analysis
JSONContainer.h
Go to the documentation of this file.
1 //
2 // Created by Nico Schäfer on 9/7/17.
3 //
4 
5 #ifndef JODA_JSONCONTAINER_H
6 #define JODA_JSONCONTAINER_H
7 
12 #include <limits.h>
13 
14 #include <cstdio>
15 #include <functional>
16 #include <unordered_set>
17 #include <vector>
18 
19 #include <joda/config/config.h>
25 #include <limits.h>
26 #include <functional>
27 #include <unordered_set>
28 #include "../../../../document/src/DocumentCostHandler.h"
30 
/// Selection mask over a container's documents: entry i is tested to decide
/// whether the document at position i takes part in an operation.
using DocIndex = std::vector<bool>;
32 
38  public:
43  JSONContainer();
49  explicit JSONContainer(size_t maxsize);
50  JSONContainer(JSONContainer &other) = delete;
51 
52  virtual ~JSONContainer();
53  void preparePurge();
54 
61  bool hasSpace(size_t size) const;
62 
70  bool hasMetaSpace(size_t size);
71 
76  unsigned long size() const;
77 
82  size_t getMaxSize() const;
83 
91  bool probContainsAttr(const std::string &attr) const;
92 
93  // Allocator
99 
100  // Documents
106  void insertDoc(std::unique_ptr<RJDocument> &&doc,
107  std::unique_ptr<IOrigin> &&origin, size_t baseIndex = 0);
108 
114  void insertDoc(RapidJsonDocument &&doc, size_t baseIndex = 0);
115 
116  // Document Access
123  std::unique_ptr<const DocIndex> checkDocuments(
124  std::function<bool(RapidJsonDocument &, size_t)> &func);
125 
132  template <class retType>
133  void forAll(std::function<retType(RapidJsonDocument &)> &func,
134  std::vector<retType> &vec);
135 
141  template <class F>
142  void forAll(F &f) {
143  ScopedRef ref(this);
144  for (auto &&doc : docs) {
145  if (doc.isValid()) {
146  f(doc);
147  }
148  }
149  setLastUsed();
150  }
151 
157  template <class F>
158  void forAll(F f) {
159  ScopedRef ref(this);
160  for (auto &&doc : docs) {
161  if (doc.isValid()) {
162  f(doc);
163  }
164  }
165  setLastUsed();
166  }
167 
175  template <class F>
176  void forAll(F f, const DocIndex &ids) {
177  ScopedRef useCont(this, false);
178  reparseSubset(ids);
179 
180  for (size_t i = 0; i < docs.size(); ++i) {
181  if (!ids[i]) continue;
182  auto &doc = docs[i];
183  if (doc.isValid()) {
184  f(doc);
185  }
186  }
187  setLastUsed();
188  }
189 
197  template <class F, class R>
198  std::vector<R> forAllRet(F f, const DocIndex &ids) {
199  auto ret = std::vector<R>();
200  ret.reserve(ids.size());
201 
202  ScopedRef useCont(this, false);
203  reparseSubset(ids);
204 
205  for (size_t i = 0; i < docs.size(); ++i) {
206  if (!ids[i]) {
207  ret.emplace_back();
208  continue;
209  };
210  auto &doc = docs[i];
211  if (doc.isValid()) {
212  ret.emplace_back(f(doc,i));
213  }
214  }
215  setLastUsed();
216  return ret;
217  }
218 
229  std::vector<std::unique_ptr<RJDocument>> projectDocuments(
230  const DocIndex &ids,
231  const std::vector<std::unique_ptr<joda::query::IProjector>> &proj,
232  RJMemoryPoolAlloc &alloc,
233  const std::vector<std::unique_ptr<joda::query::ISetProjector>> &setProj);
234 
245  std::unique_ptr<JSONContainer> createViewFromContainer(
246  const DocIndex &ids,
247  const std::vector<std::unique_ptr<joda::query::IProjector>> &proj,
248  const std::vector<std::unique_ptr<joda::query::ISetProjector>> &setProj);
249 
251  const DocIndex &ids,
252  const std::vector<std::unique_ptr<joda::query::IProjector>> &proj,
253  const std::vector<std::unique_ptr<joda::query::ISetProjector>> &setProj);
254 
256  const DocIndex &ids,
257  const std::vector<std::unique_ptr<joda::query::IProjector>> &proj,
258  const std::vector<std::unique_ptr<joda::query::ISetProjector>> &setProj);
264  void finalize();
265 
271  void metaFinalize();
272 
278  void removeDocuments();
279 
283  void reparse();
284 
288  void setViews();
289  void removeViews();
290 
296  void reparseSubset(unsigned long start = 0, unsigned long end = ULONG_MAX);
297 
302  void reparseSubset(const DocIndex &index);
303 
307  bool isReparsable();
308 
313  size_t estimatedSize() const;
314 
319  size_t parsedSize() const;
320 
321  /*
322  * Stringify
323  */
330  std::vector<std::string> stringify(unsigned long start = 0,
331  unsigned long end = ULONG_MAX);
332 
339  std::vector<std::unique_ptr<RJDocument>> getRaw(
340  unsigned long start = 0, unsigned long end = ULONG_MAX);
346  std::vector<std::unique_ptr<RJDocument>> getRaw(const DocIndex &ids);
347  std::vector<std::unique_ptr<RJDocument>> getRaw(const DocIndex &ids,
348  RJMemoryPoolAlloc &alloc);
349 
350  template <class Handler>
351  std::vector<bool> AcceptDocuments(Handler &handler, unsigned long start = 0,
352  unsigned long end = ULONG_MAX) {
353  std::vector<bool> ret;
354  if (docs.empty()) return ret;
355  ScopedRef useCont(this, false);
356  reparseSubset(start, end);
357  end = std::min(end, docs.size() - 1);
358  if (start > end) return ret;
359  for (unsigned long i = start; i <= end; ++i) {
360  auto &doc = docs[i];
361  if (doc.isValid()) {
362  if (isView()) {
363  auto &view = doc.getView();
364  ret.push_back(view->Accept(handler));
365  } else {
366  ret.push_back(doc.getJson()->Accept(handler));
367  }
368  } else
369  ret.push_back(false);
370  }
371  return ret;
372  }
373 
379  void writeFile(const std::string &file, bool append);
380 
381  /*
382  * Indices
383  */
388  const std::unique_ptr<QueryCache> &getCache() const;
393  std::unique_ptr<const DocIndex> getAllIDs() const;
394 
399  void materializeAttributes(const std::vector<std::string> &atts);
405  void materializeAttributesIfRequired(const std::vector<std::string> &atts);
410  void materializeView();
411 
416  bool isView() const;
417 
422  unsigned long getLastUsed() const;
423 
424  bool isBaseContainer(const JSONContainer *cont) const;
425 
426  /*
427  * Ref Counting
428  */
429  class ScopedRef {
430  public:
431  ScopedRef(JSONContainer *cont, bool parsed = true)
432  : engaged_(true), cont_(cont) {
433  cont_->useCont(parsed);
434  }
435 
437  if (engaged_) {
438  release();
439  }
440  }
441 
442  void release() {
443  engaged_ = false;
444  cont_->unUseCont();
445  }
446 
447  private:
448  bool engaged_;
449  JSONContainer *cont_ = nullptr;
450  };
451 
452  inline auto useContInScope(bool parse = true) { return ScopedRef(this, parse); }
453 
454  private:
455  size_t lastParsedSize = 0;
456  /*
457  * Ref Counting
458  */
459  std::atomic<unsigned int> usage{0};
460 
461  inline void useCont(bool parse = true) {
462  if (isView()) baseContainer->useCont(parse);
463  auto prev = usage.fetch_add(1);
464  if (parse) reparse();
465  if (prev == 0) setViews();
466  }
467 
468  inline void unUseCont() {
469  if (isView()) baseContainer->unUseCont();
470  auto prev = usage.fetch_sub(1);
471  if (prev == 1 && !config::storeJson) removeDocuments();
472  }
473 
474  // Indices
475  DOC_ID minID = std::numeric_limits<DOC_ID>::max();
476  DOC_ID maxID = 0;
477  bloom_filter attr_bloom;
478  bool bloomCalculated = false;
479  bool viewsComputed = false;
480  std::unique_ptr<QueryCache> cache;
481 
482  void calculateBloom();
483  void recursiveBloomAttrSearch(const RJValue &obj,
484  const std::string &attr = "");
485 
486  bool serializeMissing();
487  FILEID id = 0;
488 
492  std::vector<std::string> materializedAttributes;
493  JSONContainer *baseContainer = nullptr;
494  std::vector<JSONContainer *> subContainers;
495  std::unique_ptr<ViewStructure> viewStruc;
496 
497  void addSubContainer(JSONContainer *cont);
498  void removeSubContainer(JSONContainer *cont);
499 
505  void insertViewDoc(std::unique_ptr<RJDocument> &&doc, size_t baseIndex);
506 
507  // Size
508  size_t maxSize;
509  size_t theoreticalSize = 0;
510 
511  // Storage
512  bool final = false;
513  unsigned long lastUsed = 0;
514  bool deleted = false;
515 
516  void setLastUsed();
517 
518 
519  static bool compareDocContainer(const RapidJsonDocument &i,
520  const RapidJsonDocument &j) {
521  return i.getOrigin()->operator<(*j.getOrigin());
522  }
523 
524  std::vector<RapidJsonDocument> docs;
525  std::vector<size_t> baseIds;
526 
527  RJMemoryPoolPointer alloc;
528 };
529 
530 template <class retType>
531 void JSONContainer::forAll(std::function<retType(RapidJsonDocument &)> &func,
532  std::vector<retType> &vec) {
533  ScopedRef ref(this);
534  for (auto &&doc : docs) {
535  if (doc.isValid()) {
536  vec.push_back(std::move(func(doc)));
537  }
538  }
539  setLastUsed();
540 }
541 
542 #endif // JODA_JSONCONTAINER_H
unsigned long FILEID
Definition: FileNameRepo.h:12
std::vector< bool > DocIndex
Definition: JSONContainer.h:31
rapidjson::MemoryPoolAllocator< RJBaseAlloc > RJMemoryPoolAlloc
Definition: RJFwd.h:26
rapidjson::GenericValue< RJChar, RJMemoryPoolAlloc > RJValue
Definition: RJFwd.h:29
std::unique_ptr< RJMemoryPoolAlloc > RJMemoryPoolPointer
Definition: RapidJsonDocument.h:143
unsigned long DOC_ID
Definition: RapidJsonDocument.h:16
Definition: DocumentCostHandler.h:14
Definition: JSONContainer.h:429
~ScopedRef()
Definition: JSONContainer.h:436
ScopedRef(JSONContainer *cont, bool parsed=true)
Definition: JSONContainer.h:431
void release()
Definition: JSONContainer.h:442
Definition: JSONContainer.h:37
JSONContainer(JSONContainer &other)=delete
JSONContainer()
Definition: JSONContainer.cpp:23
void forAll(std::function< retType(RapidJsonDocument &)> &func, std::vector< retType > &vec)
Definition: JSONContainer.h:531
bool hasSpace(size_t size) const
Definition: JSONContainer.cpp:44
void materializeView()
Definition: JSONContainer.cpp:1096
bool probContainsAttr(const std::string &attr) const
Definition: JSONContainer.cpp:147
size_t getMaxSize() const
Definition: JSONContainer.cpp:920
void materializeAttributes(const std::vector< std::string > &atts)
Definition: JSONContainer.cpp:1045
virtual ~JSONContainer()
Definition: JSONContainer.cpp:981
std::unique_ptr< const DocIndex > checkDocuments(std::function< bool(RapidJsonDocument &, size_t)> &func)
Definition: JSONContainer.cpp:86
size_t estimatedSize() const
Definition: JSONContainer.cpp:770
size_t parsedSize() const
Definition: JSONContainer.cpp:1227
bool isBaseContainer(const JSONContainer *cont) const
Definition: JSONContainer.cpp:1223
void reparse()
Definition: JSONContainer.cpp:186
const std::unique_ptr< QueryCache > & getCache() const
Definition: JSONContainer.cpp:392
DocumentCostHandler createTempViewDocs(const DocIndex &ids, const std::vector< std::unique_ptr< joda::query::IProjector >> &proj, const std::vector< std::unique_ptr< joda::query::ISetProjector >> &setProj)
Definition: JSONContainer.cpp:444
void forAll(F f, const DocIndex &ids)
Definition: JSONContainer.h:176
void forAll(F &f)
Definition: JSONContainer.h:142
void reparseSubset(unsigned long start=0, unsigned long end=ULONG_MAX)
Definition: JSONContainer.cpp:252
void removeViews()
Definition: JSONContainer.cpp:1163
bool useViewBasedOnSample(const DocIndex &ids, const std::vector< std::unique_ptr< joda::query::IProjector >> &proj, const std::vector< std::unique_ptr< joda::query::ISetProjector >> &setProj)
Definition: JSONContainer.cpp:1179
void preparePurge()
Definition: JSONContainer.cpp:991
void insertDoc(std::unique_ptr< RJDocument > &&doc, std::unique_ptr< IOrigin > &&origin, size_t baseIndex=0)
Definition: JSONContainer.cpp:931
RJMemoryPoolAlloc * getAlloc()
Definition: JSONContainer.cpp:51
void setViews()
Definition: JSONContainer.cpp:1129
bool isView() const
Definition: JSONContainer.cpp:1092
void materializeAttributesIfRequired(const std::vector< std::string > &atts)
Definition: JSONContainer.cpp:996
unsigned long getLastUsed() const
Definition: JSONContainer.cpp:1094
void forAll(F f)
Definition: JSONContainer.h:158
bool hasMetaSpace(size_t size)
Definition: JSONContainer.cpp:922
void finalize()
Definition: JSONContainer.cpp:70
std::unique_ptr< JSONContainer > createViewFromContainer(const DocIndex &ids, const std::vector< std::unique_ptr< joda::query::IProjector >> &proj, const std::vector< std::unique_ptr< joda::query::ISetProjector >> &setProj)
Definition: JSONContainer.cpp:606
std::vector< bool > AcceptDocuments(Handler &handler, unsigned long start=0, unsigned long end=ULONG_MAX)
Definition: JSONContainer.h:351
auto useContInScope(bool parse=true)
Definition: JSONContainer.h:452
void metaFinalize()
Definition: JSONContainer.cpp:79
bool isReparsable()
Definition: JSONContainer.cpp:160
std::unique_ptr< const DocIndex > getAllIDs() const
Definition: JSONContainer.cpp:824
std::vector< std::unique_ptr< RJDocument > > getRaw(unsigned long start=0, unsigned long end=ULONG_MAX)
Definition: JSONContainer.cpp:828
void removeDocuments()
Definition: JSONContainer.cpp:169
std::vector< std::unique_ptr< RJDocument > > projectDocuments(const DocIndex &ids, const std::vector< std::unique_ptr< joda::query::IProjector >> &proj, RJMemoryPoolAlloc &alloc, const std::vector< std::unique_ptr< joda::query::ISetProjector >> &setProj)
Definition: JSONContainer.cpp:396
std::vector< std::string > stringify(unsigned long start=0, unsigned long end=ULONG_MAX)
Definition: JSONContainer.cpp:781
std::vector< R > forAllRet(F f, const DocIndex &ids)
Definition: JSONContainer.h:198
void writeFile(const std::string &file, bool append)
Definition: JSONContainer.cpp:817
unsigned long size() const
Definition: JSONContainer.cpp:68
Definition: RapidJsonDocument.h:22
const std::unique_ptr< const IOrigin > & getOrigin() const
Definition: RapidJsonDocument.cpp:25
Definition: bloom_filter.hpp:155
static bool storeJson
Definition: config.h:32