From bd918e47f04c05bb7206fefe8d0a12555e196124 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Sun, 31 Mar 2024 09:43:01 -0400 Subject: [PATCH] cosmetic changes to hello world --- examples/hello_world.cc | 164 +++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 94 deletions(-) diff --git a/examples/hello_world.cc b/examples/hello_world.cc index e2a0991d..9681bd4e 100644 --- a/examples/hello_world.cc +++ b/examples/hello_world.cc @@ -2,27 +2,21 @@ Copyright (c) 2019-2026, Hossein Moein All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. -* Neither the name of Hossein Moein and/or the DataFrame nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following +conditions are met: +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided with the distribution. +* Neither the name of Hossein Moein and/or the DataFrame nor the names of its contributors may be used to endorse or promote + products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, +BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL Hossein Moein BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. */ #include // Main DataFrame header @@ -69,14 +63,13 @@ struct MyData { // int main(int, char *[]) { - // If you want to fully take advantage of DataFrame parallel computing logic, - // it is recommended to call the following at the beginning of your program. + // If you want to fully take advantage of DataFrame parallel computing logic, it is recommended to call the following + // at the beginning of your program. // - // NOTE: make sure you read and understand the Multithreading section - // in the documentations (threads could potentially hinder performance). 
- // This program (hello world) is a perfect example. Since I know this program doesn’t - // deal with large datasets to trigger multithreaded algorithms, populating the thread-pool - // with threads (i.e. calling set_optimum_thread_level()) is a waste of resources. + // NOTE: make sure you read and understand the Multithreading section in the documentation (threads could + // potentially hinder performance). This program (hello world) is a perfect example. Since I know this program + // doesn’t deal with large datasets to trigger multithreaded algorithms, populating the thread-pool with threads + // (i.e. calling set_optimum_thread_level()) is a waste of resources. // ThreadGranularity::set_optimum_thread_level(); @@ -87,12 +80,11 @@ int main(int, char *[]) { ULDataFrame ul_df1; - // One way to load data into the DataFrame is one column at a time. - // A DataFrame column could be at most as long as its index column. So, you must load the indexfirst before - // loading any column. + // One way to load data into the DataFrame is one column at a time. A DataFrame column could be at most as long as its + // index column. So, you must load the index first before loading any column. // - // Once you load a column or index, the data is moved to DataFrame. The original vectors are now empty. - // There are other ways of loading data without the move. + // Once you load a column or index, the data is moved to DataFrame. The original vectors are now empty. There are other + // ways of loading data without the move. // ul_df1.load_index(std::move(idx_col1)); ul_df1.load_column("dbl_col", std::move(dbl_col1)); @@ -107,8 +99,7 @@ int main(int, char *[]) { ULDataFrame ul_df2; - // Also, you can load data into a DataFrame all at once. In this case again the data is moved to the - // DataFrame. + // Also, you can load data into a DataFrame all at once. In this case again the data is moved to the DataFrame. 
// ul_df2.load_data(std::move(idx_col2), std::make_pair("string col", str_col), @@ -117,18 +108,17 @@ int main(int, char *[]) { StrDataFrame ibm_df; - // Also, you can load data into a DataFrame from a file, supporting a few different formats. - // If the file cannot be found, an exception will be thrown. - // If the DataFrame root directory is your current directory when running this, it should work fine. + // Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found, + // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should + // work fine. // ibm_df.read("data/IBM.csv", io_format::csv2); - // To access a column, you must know its name (or index) and its type. - // In case of a "standard" DataFrame (not a view), the columns are returned as a reference to a - // std::vector of type of that column. + // To access a column, you must know its name (or index) and its type. In case of a "standard" DataFrame (not a view), + // the columns are returned as a reference to a std::vector of type of that column. // - // get_column() involves 1 or sometimes 2 hash-table lookups. - // So, you should not call it repeatedly in a loop. Instead get a reference to it and use the reference. + // get_column() involves 1 or sometimes 2 hash-table lookups. So, you should not call it repeatedly in a loop. Instead + // get a reference to it and use the reference. 
// const auto &cool_col_ref = ul_df2.get_column("Cool Column"); const auto &str_col_ref = ul_df2.get_column("string col"); @@ -139,27 +129,24 @@ int main(int, char *[]) { std::cout << str << ", "; std::cout << std::endl; - std::cout << "There are " << ibm_df.get_column("IBM_Close").size() - << " IBM close prices" << std::endl; + std::cout << "There are " << ibm_df.get_column("IBM_Close").size() << " IBM close prices" << std::endl; std::cout << "There are " << ibm_df.get_index().size() << " IBM indices" << std::endl; - // You can write the data to a file or stdout in a few formats. You must specify all the column types, - // but only once. When writing to a file, the file name/path must be create-able. + // You can write the data to a file or stdout in a few formats. You must specify all the column types, but only once. + // When writing to a file, the file name/path must be create-able. // ul_df2.write(std::cout, io_format::csv2); ibm_df.write("/tmp/test.json", io_format::json); - // You can convert a DataFrame to a string and from a string back into a DataFrame. - // This could be used to transmit a DataFrame from one place to another or store a DataFrame in - // databases, caches, … + // You can convert a DataFrame to a string and from a string back into a DataFrame. This could be used to transmit a + // DataFrame from one place to another or store a DataFrame in databases, caches, ... // const std::string ibm_df_as_str = ibm_df.to_string(); StrDataFrame ibm_df_2; - // Since we convert from native type to string and back, if you have floating point numbers with - // long precisions, you may run into precision mismatches. - // to_string() has a precision parameter you can adjust. The default is 12 which is a relatively - // high precision. + // Since we convert from native type to string and back, if you have floating point numbers with long precisions, you may + // run into precision mismatches. to_string() has a precision parameter you can adjust. 
The default is 12 which is a + // relatively high precision. // ibm_df_2.from_string(ibm_df_as_str.c_str()); // std::cout << ibm_df_as_str << std::endl; // Large output @@ -176,8 +163,7 @@ int main(int, char *[]) { // You must specify all the column types, but only once. // auto above_150_fun = [](const std::string &, const double &val)-> bool { return (val > 150.0); }; - auto above_150_df = - ibm_df.get_data_by_sel("IBM_Close", above_150_fun); + auto above_150_df = ibm_df.get_data_by_sel("IBM_Close", above_150_fun); // Or, you could choose to get a view. See docs for views. // @@ -187,48 +173,42 @@ int main(int, char *[]) { // You can get another DataFrame by group-bying on one or multiple columns. // You must specify only the type(s) of column(s), you are group-bying. // - // Group-by column dbl_col, and I am specifying how to summarize the index column and each of the - // other columns. + // Group-by column dbl_col, and I am specifying how to summarize the index column and each of the other columns. // - auto gb_df = - ul_df1.groupby1("dbl_col", - LastVisitor(), - std::make_tuple("integers", "sum_int", SumVisitor()), - std::make_tuple("my_data_col", "last_my_data", LastVisitor())); + auto gb_df = ul_df1.groupby1("dbl_col", + LastVisitor(), + std::make_tuple("integers", "sum_int", SumVisitor()), + std::make_tuple("my_data_col", "last_my_data", LastVisitor())); - // You can run statistical, financial, ML, … algorithms on one or multiple columns by using visitors. - // You must specify the column(s) type(s). - // The visitor's data column is of type double and its index column is of type std::string. + // You can run statistical, financial, ML, … algorithms on one or multiple columns by using visitors. You must specify + // the column(s) type(s). The visitor's data column is of type double and its index column is of type std::string. 
// StdVisitor stdev_v; ibm_df.visit("IBM_Close", stdev_v); std::cout << "Standard deviation of IBM close prices: " << stdev_v.get_result() << std::endl; - // Now let’s declare two DataFrames with index type of DateTime which is a handy object for - // date/time manipulations. + // Now let’s declare two DataFrames with index type of DateTime which is a handy object for date/time manipulations. // DTDataFrame ibm_dt_df; DTDataFrame aapl_dt_df; - // Let’s read the AAPL and IBM market data from their files. The data for these two stocks start and end - // at different dates. But there is overlapping data between them. + // Let’s read the AAPL and IBM market data from their files. The data for these two stocks start and end at different + // dates. But there is overlapping data between them. // ibm_dt_df.read("data/DT_IBM.csv", io_format::csv2); aapl_dt_df.read("data/DT_AAPL.csv", io_format::csv2); // First let’s make sure if there are missing data in our important columns, we fill them up. // - ibm_dt_df.fill_missing({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" }, - fill_policy::linear_interpolate); + ibm_dt_df.fill_missing({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" }, fill_policy::linear_interpolate); // Now we join the AAPL and IBM DataFrames using their indices and applying inner-join policy. // - DTDataFrame aapl_ibm = - ibm_dt_df.join_by_index(aapl_dt_df, join_policy::inner_join); + DTDataFrame aapl_ibm = ibm_dt_df.join_by_index(aapl_dt_df, join_policy::inner_join); - // Now we calculate the Pearson correlation coefficient between AAPL and IBM close prices. - // The visitor's data columns are of type double and its index column is of type DateTime. + // Now we calculate the Pearson correlation coefficient between AAPL and IBM close prices. The visitor's data columns are + // of type double and its index column is of type DateTime. 
// CorrVisitor corrl_v; @@ -236,9 +216,9 @@ int main(int, char *[]) { << aapl_ibm.visit("AAPL_Close", "IBM_Close", corrl_v).get_result() << std::endl; - // Now let’s do something more sophisticated and calculate rolling exponentially weighted correlations - // between IBM and Apple close prices. Since this is a rolling -- moving -- analysis the result is a - // vector of exponentially weighted correlations for each date in the data stream. + // Now let’s do something more sophisticated and calculate rolling exponentially weighted correlations between IBM and + // Apple close prices. Since this is a rolling -- moving -- analysis the result is a vector of exponentially weighted + // correlations for each date in the data stream. // ewm_corr_v ewmcorr { exponential_decay_spec::span, 3 }; const auto &ewmcorr_result = @@ -264,8 +244,7 @@ int main(int, char *[]) { std::make_tuple("AAPL_Close", "Median", MedianVisitor()), std::make_tuple("AAPL_Close", "25% Quantile", QuantileVisitor(0.25)), std::make_tuple("AAPL_Close", "Std", StdVisitor()), - // "Mode" column is a column of std::array's -- It cannot be printed - // by default + // "Mode" column is a column of std::array's std::make_tuple("AAPL_Close", "Mode", ModeVisitor<2, double, dt_idx_t>()), std::make_tuple("AAPL_Close", "MAD", MADVisitor(mad_type::mean_abs_dev_around_mean)), // "Z Score" column is a column of std::vector's @@ -285,11 +264,9 @@ int main(int, char *[]) { // --------------------------------------------------- // // Now let’s do some stuff that are a little more involved (multi steps). There are a lot of theories, - // math, and procedures that I am skipping to explain here. - // See docs for more details. + // math, and procedures that I am skipping to explain here. See docs for more details. // - // NOTE: I am applying the following analysis to financial data but it equally applies to other - // scientific fields. 
+ // NOTE: I am applying the following analysis to financial data but it equally applies to other scientific fields. // // --------------------------------------------------- @@ -339,19 +316,18 @@ int main(int, char *[]) { << "75% quantile: " << qt75.get_result() << ", " << "95% quantile: " << qt95.get_result() << std::endl; - // Now let’s do another interesting thing. Let’s take the IBM returns curve and split it into - // 3 different curves; Trend, Seasonal, and Idiocentric or Residual or Random. - // For the sake of this exercise, we assume IBM business goes through 170-day seasonal cycles. + // Now let’s do another interesting thing. Let’s take the IBM returns curve and split it into 3 different curves; Trend, + // Seasonal, and Idiocentric or Residual or Random. For the sake of this exercise, we assume IBM business goes through + // 170-day seasonal cycles. // DecomposeVisitor decom { 170, 0.6, 0.01 }; - // After this call, the 3 curves will be in decom visitor instance. See docs how to get them - // and analyze them. + // After this call, the 3 curves will be in decom visitor instance. See docs how to get them and analyze them. // ibm_dt_df.single_act_visit("IBM_Return", decom); - // But what if you don’t know the seasonality of IBM returns which would be most of the time. - // No worries, Mr Joseph Fourier comes to the rescue. + // But what if you don’t know the seasonality of IBM returns which would be most of the time. No worries, + // Mr. Joseph Fourier comes to the rescue. // FastFourierTransVisitor fft; @@ -360,8 +336,8 @@ int main(int, char *[]) { const auto &magnitudes = fft.get_magnitude(); double max_val = 0; - // The following analysis and conclusion are over simplified and naive. It is more involved - // which is behind the scope of Hello World. But this is the basic idea. + // The following analysis and conclusion are oversimplified and naive. It is more involved which is beyond the scope of + // Hello World. But this is the basic idea. 
// for (std::size_t i = 1; i < magnitudes.size(); ++i) { const double val = 1.0 / magnitudes[i];