#ifndef _SKETCH
#define _SKETCH
#include "xis.h"
using namespace std;
/*
Generic interface for the sketches estimating size of joins and self-join sizes
For details on all the sketches implemented here, see the paper:
1) "Statistical Analysis of Sketch Estimators" by F. Rusu and A. Dobra
*/
class Sketch
{
public:
double Average(double *x, int n);
double Median(double *x, int n);
double Min(double *x, int n);
virtual ~Sketch();
//reseting the sketch structure
virtual void Clear_Sketch() = 0;
//updating the sketch with the value corresponding to the given key
virtual void Update_Sketch(unsigned int key, double func) = 0;
//estimating the size of join of two sketches; the second sketch is passed in as s1
virtual double Size_Of_Join(Sketch *s1) = 0;
//estimating the self-join size or the second frequency moment
virtual double Self_Join_Size() = 0;
};
/*
The original AGMS sketches proposed in the paper:
1) "Tracking join and self-join sizes in limited storage" by N. Alon et al.
An array of rows_no * cols_no basic sketch estimators is used to compute the improved estimator. cols_no estimators from the same row are averaged, then the median of the rows_no average estimators is returned as the final estimate. sketch_elem is the array of basic estimators. xi_pm1 is an array of pseudo-random variables used to compute the basic sketch estimators. A pseudo-random variable is attached to each basic sketch estimator.
*/
class AGMS_Sketch : public Sketch
{
protected:
unsigned int rows_no;
unsigned int cols_no;
double *sketch_elem;
Xi **xi_pm1;
public:
AGMS_Sketch(unsigned int cols_no, unsigned int rows_no, Xi **xi_pm1);
virtual ~AGMS_Sketch();
virtual void Clear_Sketch();
virtual void Update_Sketch(unsigned int key, double func);
virtual double Size_Of_Join(Sketch *s1);
virtual double Self_Join_Size();
};
/*
Fast-AGMS sketches proposed in the paper:
1) "Sketching streams through the net: distributed approximate query tracking" by G. Cormode and M. Garofalakis
rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the median of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is initially hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the +/-1 random variable generated for the same key by xi_pm1 corresponding to the same row.
*/
class FAGMS_Sketch : public Sketch
{
protected:
unsigned int buckets_no;
unsigned int rows_no;
double *sketch_elem;
Xi **xi_bucket;
Xi **xi_pm1;
public:
FAGMS_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket, Xi **xi_pm1);
virtual ~FAGMS_Sketch();
virtual void Clear_Sketch();
virtual void Update_Sketch(unsigned int key, double func);
virtual double Size_Of_Join(Sketch *s1);
virtual double Self_Join_Size();
};
/*
Fast-Count sketches proposed in the paper:
1) "Tabulation-based 4-universal hashing with applications to second moment estimation" by M. Thorup and Y. Zhang
rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the average of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the value corresponding to the key.
*/
class Fast_Count_Sketch : public Sketch
{
protected:
unsigned int buckets_no;
unsigned int rows_no;
double *sketch_elem;
Xi **xi_bucket;
public:
Fast_Count_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket);
virtual ~Fast_Count_Sketch();
virtual void Clear_Sketch();
virtual void Update_Sketch(unsigned int key, double func);
virtual double Size_Of_Join(Sketch *s1);
virtual double Self_Join_Size();
};
/*
Count-Min sketches proposed in the paper:
1) "An improved data stream summary: the count-min sketch and its applications" by G. Cormode and S. Muthukrishnan
rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the minimum of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the value corresponding to the key.
*/
class Count_Min_Sketch : public Sketch
{
protected:
unsigned int buckets_no;
unsigned int rows_no;
double *sketch_elem;
Xi **xi_bucket;
public:
Count_Min_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket);
virtual ~Count_Min_Sketch();
virtual void Clear_Sketch();
virtual void Update_Sketch(unsigned int key, double func);
virtual double Size_Of_Join(Sketch *s1);
virtual double Self_Join_Size();
};
#endif