JODA  0.13.1 (59b41972)
JSON On-Demand Analysis
SimilarityScheduler.h
Go to the documentation of this file.
1 //
2 // Created by Nico on 13/05/2019.
3 //
4 
5 #ifndef JODA_SIMILARITYSCHEDULER_H
6 #define JODA_SIMILARITYSCHEDULER_H
7 
8 #include <joda/config/config.h>
12 #include <rapidjson/istreamwrapper.h>
14 
// Scheduler that assigns documents to JSONContainers by similarity (measure
// type SIM) and forwards filled containers to a queue.
// NOTE(review): several declaration lines (class header, constructor head,
// one getContainerForDoc overload, scheduleDocument head, the second typedef
// line and the queue member) are not present in this listing.
20 template <typename SIM>
22  public:
// Identifier of a container slot; the definitions below use it as an index
// into `container`.
23  typedef size_t ContainerIdentifier;
24 
// Tail of the constructor declaration: contSize defaults to 0, which
// createContainer() maps to a default-constructed JSONContainer.
32  size_t contSize = 0);
33 
// Chooses the container for a raw document string; creates a new container
// when no existing one is similar enough.
41  ContainerIdentifier getContainerForDoc(std::string &raw);
42 
51 
// Same selection as above, for a document read from a RapidJSON input stream.
59  ContainerIdentifier getContainerForDoc(rapidjson::IStreamWrapper &stream);
60 
// Creates a new RJDocument that uses the allocator of container `id`.
67  std::unique_ptr<RJDocument> getNewDoc(ContainerIdentifier id);
68 
// Tail of the scheduleDocument declaration: inserts `doc` with its `origin`
// into the chosen container and enqueues the container once it is full.
77  std::unique_ptr<RJDocument> &&doc,
78  std::unique_ptr<IOrigin> &&origin, size_t size);
79  virtual ~SimilarityScheduler() = default;
80 
// Sends every non-empty container to the queue and signals that this producer
// is finished.
85  void finalize();
86 
87  private:
// Pair of a container (first) and a second element that is passed to
// SIM::measure() in getContainerForDoc.
// NOTE(review): the second template argument of the pair is on a line missing
// from this listing; presumably the similarity representation - confirm.
88  typedef std::pair<std::unique_ptr<JSONContainer>,
90  sCont;
// All currently open containers, indexed by ContainerIdentifier.
91  std::vector<sCont> container;
// Size argument handed to createContainer() for every new container.
93  size_t contSize;
// Builds a new JSONContainer; contSize == 0 selects the default constructor.
94  std::unique_ptr<JSONContainer> createContainer(size_t contSize) const;
95 };
96 
97 template <typename SIM>
98 std::unique_ptr<JSONContainer> SimilarityScheduler<SIM>::createContainer(
99  size_t contSize) const {
100  if (contSize == 0)
101  return std::make_unique<JSONContainer>(); //& Create new one
102  else
103  return std::make_unique<JSONContainer>(contSize); //& Create new one
104 }
105 
106 template <typename SIM>
107 std::unique_ptr<RJDocument> SimilarityScheduler<SIM>::getNewDoc(
108  ContainerIdentifier id) {
109  return std::make_unique<RJDocument>(container[id].first->getAlloc());
110 }
111 
// scheduleDocument: moves `doc` and `origin` into container `id`; when that
// container is full afterwards it is finalized, sent to the queue and replaced
// by a fresh container.
// NOTE(review): the line carrying the return type and function name is missing
// from this listing.
// NOTE(review): the `size` parameter is not used anywhere in the visible body.
112 template <typename SIM>
114  ContainerIdentifier id, std::unique_ptr<RJDocument> &&doc,
115  std::unique_ptr<IOrigin> &&origin, size_t size) {
116  container[id].first->insertDoc(std::move(doc), std::move(origin));
// Flush only when the container reports no space left AND holds at least one
// document.
117  if (!(container[id].first->hasSpace(0) || container[id].first->size() == 0)) {
// NOTE(review): unlike SimilarityScheduler::finalize(), this path does not
// call removeDocuments() when config::storeJson is false - confirm intended.
118  container[id].first->finalize();
119  DCHECK(container[id].first != nullptr);
120  queue->send(std::move(container[id].first)); // Enqueue
// send() is expected to have taken ownership, leaving the slot empty.
121  DCHECK(container[id].first == nullptr);
// Refill the slot so the identifier stays usable for later documents.
122  container[id].first = createContainer(contSize);
123  DCHECK(container[id].first != nullptr);
124  }
125 }
126 
// Constructor: stores the output queue pointer and the container size used for
// every container created later.
// NOTE(review): the line with the qualified constructor name is missing from
// this listing.
127 template <typename SIM>
129  JsonContainerQueue::queue_t *queue, size_t contSize)
130  : queue(queue), contSize(contSize) {}
131 
// getContainerForDoc (raw-string overload): computes the representation of
// `raw`, compares it with the representation stored for every open container
// and returns the index of the most similar one if its similarity reaches
// config::sim_min_similarity; otherwise appends a new container and returns
// its index.
// NOTE(review): the signature lines and the declaration of `simRep` are
// missing from this listing.
132 template <typename SIM>
135  SIM measure;
137  DCHECK(simRep.is_implemented)
138  << "Function called with wrong Similarity measure";
139  auto docRep = simRep.getRepresentation(raw);
140 
141  if (!container.empty()) {
// Linear scan for the best match; only similarities strictly above 0 can win.
142  double maxSim = 0;
// NOTE(review): int receives a size_t index below (narrowing).
143  int maxSimi = -1;
144  for (size_t i = 0; i < container.size(); ++i) {
145  double sim = measure.measure(docRep, container[i].second);
146  if (sim > maxSim) {
147  maxSim = sim;
148  maxSimi = i;
149  }
150  }
// NOTE(review): if config::sim_min_similarity is <= 0 and no similarity
// exceeded 0, maxSimi is still -1 here and is returned converted to the
// (declared size_t) identifier - confirm the threshold is always positive.
151  if (maxSim >= config::sim_min_similarity) {
152  return maxSimi;
153  }
154  }
// No sufficiently similar container: open a new one keyed by this document's
// representation.
155  container.emplace_back(createContainer(contSize), std::move(docRep));
156  return container.size() - 1;
157 }
158 
// getContainerForDoc (overload taking `doc`): same selection logic as the
// raw-string overload, with the representation computed from `doc`.
// NOTE(review): the signature lines (including the type of `doc`) and the
// declaration of `simRep` are missing from this listing.
159 template <typename SIM>
162  SIM measure;
164  DCHECK(simRep.is_implemented)
165  << "Function called with wrong Similarity measure";
166  auto docRep = simRep.getRepresentation(doc);
167 
168  if (!container.empty()) {
// Linear scan for the best match; only similarities strictly above 0 can win.
169  double maxSim = 0;
// NOTE(review): int receives a size_t index below (narrowing).
170  int maxSimi = -1;
171  for (size_t i = 0; i < container.size(); ++i) {
172  double sim = measure.measure(docRep, container[i].second);
173  if (sim > maxSim) {
174  maxSim = sim;
175  maxSimi = i;
176  }
177  }
// NOTE(review): if config::sim_min_similarity is <= 0 and no similarity
// exceeded 0, maxSimi is still -1 here and is returned converted to the
// (declared size_t) identifier - confirm the threshold is always positive.
178  if (maxSim >= config::sim_min_similarity) {
179  return maxSimi;
180  }
181  }
// No sufficiently similar container: open a new one keyed by this document's
// representation.
182  container.emplace_back(createContainer(contSize), std::move(docRep));
183  return container.size() - 1;
184 }
185 
// getContainerForDoc (stream overload): same selection logic as the other
// overloads, with the representation computed from the RapidJSON input stream.
// NOTE(review): the lines with the return type and function name, and the
// declaration of `simRep`, are missing from this listing.
186 template <typename SIM>
189  rapidjson::IStreamWrapper &stream) {
190  SIM measure;
192  DCHECK(simRep.is_implemented)
193  << "Function called with wrong Similarity measure";
194  auto docRep = simRep.getRepresentation(stream);
195 
196  if (!container.empty()) {
// Linear scan for the best match; only similarities strictly above 0 can win.
197  double maxSim = 0;
198  int maxSimi = -1;
// NOTE(review): `auto i = 0` deduces int, so the loop condition compares a
// signed value with container.size(); the sibling overloads use size_t.
199  for (auto i = 0; i < container.size(); ++i) {
200  double sim = measure.measure(docRep, container[i].second);
201  if (sim > maxSim) {
202  maxSim = sim;
203  maxSimi = i;
204  }
205  }
// NOTE(review): if config::sim_min_similarity is <= 0 and no similarity
// exceeded 0, maxSimi is still -1 here and is returned converted to the
// (declared size_t) identifier - confirm the threshold is always positive.
206  if (maxSim >= config::sim_min_similarity) {
207  return maxSimi;
208  }
209  }
// No sufficiently similar container: open a new one keyed by this document's
// representation.
210  container.emplace_back(createContainer(contSize), std::move(docRep));
211  return container.size() - 1;
212 }
213 
// finalize: finalizes every non-empty container, drops its documents when
// config::storeJson is false, sends it to the queue, and finally signals
// producerFinished() on the queue.
// NOTE(review): the line with the return type and function name is missing
// from this listing.
214 template <typename SIM>
216  // TODO Merge small container
217 
218  for (auto &&currentSimContainer : container) {
219  auto &currentContainer = currentSimContainer.first;
// NOTE(review): size() is called through the pointer before the null DCHECK
// below, so the check cannot catch a null container.
220  if (currentContainer->size() > 0) {
221  DCHECK(currentContainer != nullptr);
222  currentContainer->finalize();
223  if (!config::storeJson) currentContainer->removeDocuments();
224  queue->send(std::move(currentContainer)); // Enqueue
// The slot stays in `container` but is expected to be null after the send;
// NOTE(review): a later call that dereferences it (a second finalize(),
// getNewDoc, scheduleDocument) would hit a null pointer - confirm finalize()
// is only ever called once, as the last operation.
225  DCHECK(currentContainer == nullptr);
226  }
227  }
228  queue->producerFinished();
229 }
230 
231 #endif // JODA_SIMILARITYSCHEDULER_H
rapidjson::GenericDocument< RJChar, RJMemoryPoolAlloc, RJBaseAlloc > RJDocument
Definition: RJFwd.h:28
Definition: SimilarityScheduler.h:21
void scheduleDocument(ContainerIdentifier id, std::unique_ptr< RJDocument > &&doc, std::unique_ptr< IOrigin > &&origin, size_t size)
Definition: SimilarityScheduler.h:113
virtual ~SimilarityScheduler()=default
ContainerIdentifier getContainerForDoc(std::string &raw)
Definition: SimilarityScheduler.h:134
std::unique_ptr< RJDocument > getNewDoc(ContainerIdentifier id)
Definition: SimilarityScheduler.h:107
void finalize()
Definition: SimilarityScheduler.h:215
size_t ContainerIdentifier
Definition: SimilarityScheduler.h:23
SimilarityScheduler(JsonContainerQueue::queue_t *queue, size_t contSize=0)
Definition: SimilarityScheduler.h:128
static bool storeJson
Definition: config.h:32
static double sim_min_similarity
Definition: config.h:67
Definition: Queue.h:19
Definition: IJSONSimilarityMeasure.h:36
Representation getRepresentation(const RJDocument &lhs)
Definition: IJSONSimilarityMeasure.h:46
void * Representation
Definition: IJSONSimilarityMeasure.h:38
bool is_implemented
Definition: IJSONSimilarityMeasure.h:56