#ifndef _SKETCH #define _SKETCH #include "xis.h" using namespace std; /* Generic interface for the sketches estimating size of joins and self-join sizes For details on all the sketches implemented here, see the paper: 1) "Statistical Analysis of Sketch Estimators" by F. Rusu and A. Dobra */ class Sketch { public: double Average(double *x, int n); double Median(double *x, int n); double Min(double *x, int n); virtual ~Sketch(); //reseting the sketch structure virtual void Clear_Sketch() = 0; //updating the sketch with the value corresponding to the given key virtual void Update_Sketch(unsigned int key, double func) = 0; //estimating the size of join of two sketches; the second sketch is passed in as s1 virtual double Size_Of_Join(Sketch *s1) = 0; //estimating the self-join size or the second frequency moment virtual double Self_Join_Size() = 0; }; /* The original AGMS sketches proposed in the paper: 1) "Tracking join and self-join sizes in limited storage" by N. Alon et al. An array of rows_no * cols_no basic sketch estimators is used to compute the improved estimator. cols_no estimators from the same row are averaged, then the median of the rows_no average estimators is returned as the final estimate. sketch_elem is the array of basic estimators. xi_pm1 is an array of pseudo-random variables used to compute the basic sketch estimators. A pseudo-random variable is attached to each basic sketch estimator. */ class AGMS_Sketch : public Sketch { protected: unsigned int rows_no; unsigned int cols_no; double *sketch_elem; Xi **xi_pm1; public: AGMS_Sketch(unsigned int cols_no, unsigned int rows_no, Xi **xi_pm1); virtual ~AGMS_Sketch(); virtual void Clear_Sketch(); virtual void Update_Sketch(unsigned int key, double func); virtual double Size_Of_Join(Sketch *s1); virtual double Self_Join_Size(); }; /* Fast-AGMS sketches proposed in the paper: 1) "Sketching streams through the net: distributed approximate query tracking" by G. Cormode and M. Garofalakis rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the median of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is initially hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the +/-1 random variable generated for the same key by xi_pm1 corresponding to the same row. */ class FAGMS_Sketch : public Sketch { protected: unsigned int buckets_no; unsigned int rows_no; double *sketch_elem; Xi **xi_bucket; Xi **xi_pm1; public: FAGMS_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket, Xi **xi_pm1); virtual ~FAGMS_Sketch(); virtual void Clear_Sketch(); virtual void Update_Sketch(unsigned int key, double func); virtual double Size_Of_Join(Sketch *s1); virtual double Self_Join_Size(); }; /* Fast-Count sketches proposed in the paper: 1) "Tabulation-based 4-universal hashing with applications to second moment estimation" by M. Thorup and Y. Zhang rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the average of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the value corresponding to the key. */ class Fast_Count_Sketch : public Sketch { protected: unsigned int buckets_no; unsigned int rows_no; double *sketch_elem; Xi **xi_bucket; public: Fast_Count_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket); virtual ~Fast_Count_Sketch(); virtual void Clear_Sketch(); virtual void Update_Sketch(unsigned int key, double func); virtual double Size_Of_Join(Sketch *s1); virtual double Self_Join_Size(); }; /* Count-Min sketches proposed in the paper: 1) "An improved data stream summary: the count-min sketch and its applications" by G. Cormode and S. Muthukrishnan rows_no basic estimators are computed by hashing the key values to an array with buckets_no elements. Then the minimum of the rows_no estimators is returned as the final estimator. sketch_elem is an array of rows_no * buckets_no counters. A key is hashed using the xi_bucket pseudo-random variable corresponding to the actual row, then the counter in that bucket is updated with the value corresponding to the key. */ class Count_Min_Sketch : public Sketch { protected: unsigned int buckets_no; unsigned int rows_no; double *sketch_elem; Xi **xi_bucket; public: Count_Min_Sketch(unsigned int buckets_no, unsigned int rows_no, Xi **xi_bucket); virtual ~Count_Min_Sketch(); virtual void Clear_Sketch(); virtual void Update_Sketch(unsigned int key, double func); virtual double Size_Of_Join(Sketch *s1); virtual double Self_Join_Size(); }; #endif