diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..cd86de2ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/build +*.pyc +/src/_compress.c +*.egg +*.so +*.egg-info + +# Coverage +htmlcov +.coverage +coverage.xml +junit.xml diff --git a/.project b/.project new file mode 100644 index 000000000..423105518 --- /dev/null +++ b/.project @@ -0,0 +1,109 @@ + + + arctic + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + + + 1321466330118 + + 10 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.egg-info + + + + 1321466330219 + + 14 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.egg + + + + 1321466330229 + + 26 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-__pycache__ + + + + 1321466330237 + + 6 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-.pydevproject + + + + 1321466330246 + + 22 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.pyc + + + + 1321466330785 + + 10 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.egg-info + + + + 1321466330808 + + 14 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.egg + + + + 1321466330816 + + 26 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-__pycache__ + + + + 1321466330826 + + 6 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-.pydevproject + + + + 1321466330835 + + 22 + + org.eclipse.ui.ide.multiFilter + 1.0-name-matches-false-false-*.pyc + + + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 000000000..fa5607f9d --- /dev/null +++ b/.pydevproject @@ -0,0 +1,8 @@ + + +Default +python 2.7 + +/arctic + + diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..e9ab0b39d --- /dev/null +++ b/LICENSE @@ -0,0 +1,458 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. 
+ + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. 
+ + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. 
Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. 
In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md new file mode 100644 index 000000000..e3c4594ed --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +# [Arctic TimeSeries and Tick store](https://github.com/ahlmss/arctic) + +Arctic is a high performance datastore for numeric data. It supports [Pandas](http://pandas.pydata.org/), +[numpy](http://www.numpy.org/) arrays and pickled objects out-of-the-box, with pluggable support for +other data types and optional versioning. + +Arctic can query millions of rows per second per client, achieves ~10x compression on network bandwidth, +~10x compression on disk, and scales to hundreds of millions of rows per second per +[MongoDB](https://www.mongodb.org/) instance. + +Arctic has been under active development at [Man AHL](http://www.ahl.com/) since 2012. 
+
+## Quickstart
+
+
+### Run a MongoDB
+
+```
+mongod --dbpath <path/to/db_directory>
+```
+
+### Using VersionStore
+
+```
+from arctic import Arctic
+
+# Connect to the local MongoDB
+store = Arctic('localhost')
+
+# Create the library - defaults to VersionStore
+store.initialize_library('NASDAQ')
+
+# Access the library
+library = store['NASDAQ']
+
+# Load some data - maybe from Quandl
+aapl = Quandl.get("NASDAQ/AAPL", authtoken="your token here")
+
+# Store the data in the library
+library.write('AAPL', aapl, metadata={'source': 'Quandl'})
+
+# Reading the data
+item = library.read('AAPL')
+aapl = item.data
+metadata = item.metadata
+```
+
+VersionStore supports much more: [See the HowTo](howtos/how_to_use_arctic.py)!
+
+
+### Adding your own storage engine
+
+Plugging a custom class in as a library type is straightforward. [This example
+shows how.](howtos/how_to_custom_arctic_library.py)
+
+
+
+## Concepts
+
+### Libraries
+
+Arctic provides namespaced *libraries* of data. These libraries allow
+bucketing data by *source*, *user* or some other metric (for example frequency:
+End-Of-Day; Minute Bars; etc.).
+
+Arctic supports multiple data libraries per user. A user (or namespace)
+maps to a MongoDB database (the granularity of mongo authentication). The library
+itself is composed of a number of collections within the database. Libraries look like:
+
+ * user.EOD
+ * user.ONEMINUTE
+
+A library is mapped to a Python class. All library databases in MongoDB are prefixed with 'arctic_'.
+
+### Storage Engines
+
+Arctic includes two storage engines:
+
+ * [VersionStore](arctic/store/version_store.py): a key-value versioned TimeSeries store. It supports:
+   * Pandas data types (other Python types pickled)
+   * Multiple versions of each data item. Can easily read previous versions.
+   * Create point-in-time snapshots across symbols in a library
+   * Soft quota support
+   * Hooks for persisting other data types
+   * Audited writes: API for saving metadata and data before and after a write.
+   * A wide range of TimeSeries data frequencies: End-Of-Day to Minute bars
+   * [See the HowTo](howtos/how_to_use_arctic.py)
+ * [TickStore](arctic/tickstore/tickstore.py): Column-oriented tick database. Supports
+   dynamic fields; chunks aren't versioned. Designed for large, continuously ticking data.
+
+Arctic storage implementations are **pluggable**. VersionStore is the default.
+
+
+## Requirements
+
+Arctic currently works with:
+
+ * Python 2.7
+ * pymongo >= 3.0
+ * Pandas
+ * MongoDB >= 2.4.x
+
+
+## Acknowledgements
+
+Arctic has been under active development at [Man AHL](http://www.ahl.com/) since 2012.
+
+It wouldn't be possible without the work of the AHL Data Engineering Team, including:
+
+ * [Richard Bounds](https://github.com/richardbounds)
+ * [James Blackburn](https://github.com/jamesblackburn)
+ * [Vlad Mereuta](https://github.com/vmereuta)
+ * Tom Taylor
+ * Tope Olukemi
+ * Drake Siard
+ * ... and many others ...
+
+Contributions welcome!
+
+## License
+
+Arctic is licensed under the GNU LGPL v2.1.
A copy of which is included in [LICENSE](LICENSE) diff --git a/arctic/__init__.py b/arctic/__init__.py new file mode 100644 index 000000000..1eb3bb06a --- /dev/null +++ b/arctic/__init__.py @@ -0,0 +1,12 @@ +""" The Arctic TimeSeries and Tick store.""" + +from .arctic import Arctic, register_library_type +from .arctic import VERSION_STORE, TICK_STORE +from .store.version_store import register_versioned_storage +from .store._pandas_ndarray_store import PandasDataFrameStore, PandasSeriesStore, PandasPanelStore +from .store._ndarray_store import NdarrayStore + +register_versioned_storage(PandasDataFrameStore) +register_versioned_storage(PandasSeriesStore) +register_versioned_storage(PandasPanelStore) +register_versioned_storage(NdarrayStore) diff --git a/arctic/_compression.py b/arctic/_compression.py new file mode 100644 index 000000000..ebd1c5266 --- /dev/null +++ b/arctic/_compression.py @@ -0,0 +1,78 @@ +from .logging import logger +import _compress as clz4 + + +USE_LZ4HC = True # switch to use LZ4HC. Default True +LZ4HC_N_PARALLEL = 5 # No. of elements to use parellel compression in LZ4HC mode +LZ4_N_PARALLEL = 50 # No. of elements to use parellel compression in LZ4 mode + + +def use_lz4hc(mode): + """ + Set the global LZ4HC mode + + Parameters + ---------- + mode: `bool` + True: Use LZ4HC False: Use LZ4 + """ + global USE_LZ4HC + USE_LZ4HC = mode + logger.info("Setting compression mode to %s" % ("LZ4HC" if mode else "LZ4 (no HC)")) + + +def _should_use_lz4hc(): + return USE_LZ4HC + + +def _is_interactive_mode(): + # http://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode + # currently unused - but could in-future flip to LZ4 if in interactive mode + import __main__ as main + return not hasattr(main, '__file__') + + +def compress_array(str_list): + """ + Compress an array of strings + + By default LZ4 mode is standard in interactive mode, + and high compresion in applications/scripts + """ + if _should_use_lz4hc(): + # Less than 5 chunks its quicker to compress sequentially.. + if len(str_list) > LZ4HC_N_PARALLEL: + return clz4.compressarrHC(str_list) + else: + return [clz4.compressHC(s) for s in str_list] + else: + # Less than 50 chunks its quicker to compress sequentially.. 
+ if len(str_list) > LZ4_N_PARALLEL: + return clz4.compressarr(str_list) + else: + return [clz4.compress(s) for s in str_list] + + +def compress(_str): + """ + Compress a string + + By default LZ4 mode is standard in interactive mode, + and high compresion in applications/scripts + """ + compressfn = clz4.compressHC if _should_use_lz4hc() else clz4.compress + return compressfn(_str) + + +def decompress(_str): + """ + Decompress a string + """ + return clz4.decompress(_str) + + +def decompress_array(str_list): + """ + Decompress a list of strings + """ + return clz4.decompressarr(str_list) diff --git a/arctic/_util.py b/arctic/_util.py new file mode 100644 index 000000000..2141908f8 --- /dev/null +++ b/arctic/_util.py @@ -0,0 +1,54 @@ +from datetime import datetime +from pandas import DataFrame +from pandas.util.testing import assert_frame_equal +from pymongo.errors import OperationFailure +import string + +from .logging import logger + + +def indent(s, num_spaces): + s = string.split(s, '\n') + s = [(num_spaces * ' ') + line for line in s] + s = string.join(s, '\n') + return s + + +def are_equals(o1, o2, **kwargs): + try: + if isinstance(o1, DataFrame): + assert_frame_equal(o1, o2, kwargs) + return True + return o1 == o2 + except Exception: + return False + + +def enable_sharding(arctic, library_name, hashed=False): + c = arctic._conn + lib = arctic[library_name]._arctic_lib + dbname = lib._db.name + library_name = lib.get_top_level_collection().name + try: + c.admin.command('enablesharding', dbname) + except OperationFailure, e: + if not 'failed: already enabled' in str(e): + raise + if not hashed: + logger.info("Range sharding 'symbol' on: " + dbname + '.' + library_name) + c.admin.command('shardCollection', dbname + '.' + library_name, key={'symbol': 1}) + else: + logger.info("Hash sharding 'symbol' on: " + dbname + '.' + library_name) + c.admin.command('shardCollection', dbname + '.' + library_name, key={'symbol': 'hashed'}) + + +def enable_powerof2sizes(arctic, library_name): + lib = arctic[library_name]._arctic_lib + collection = lib.get_top_level_collection() + lib._db.command({"collMod" : collection.name, 'usePowerOf2Sizes': "true"}) + logger.info("usePowerOf2Sizes enabled for %s", collection.name) + + for coll in collection.database.collection_names(): + if coll.startswith("%s." 
% collection.name): + lib._db.command({"collMod" : coll, 'usePowerOf2Sizes': "true"}) + logger.info("usePowerOf2Sizes enabled for %s", coll) diff --git a/arctic/arctic.py b/arctic/arctic.py new file mode 100644 index 000000000..e5864c16b --- /dev/null +++ b/arctic/arctic.py @@ -0,0 +1,444 @@ +import pymongo +from pymongo.errors import OperationFailure, AutoReconnect +from pymongo.read_preferences import ReadPreference + +from .auth import authenticate, get_auth +from .hooks import get_mongodb_uri +from .logging import logger +from .decorators import mongo_retry +from ._util import indent + +from .exceptions import LibraryNotFoundException, ArcticException, QuotaExceededException +from .store import version_store +from .tickstore import tickstore +from .tickstore import toplevel + +__all__ = ['Arctic', 'VERSION_STORE', 'TICK_STORE', 'register_library_type'] + +# Default Arctic application name: 'arctic' +APPLICATION_NAME = 'arctic' +VERSION_STORE = version_store.VERSION_STORE_TYPE +TICK_STORE = tickstore.TICK_STORE_TYPE +LIBRARY_TYPES = {version_store.VERSION_STORE_TYPE: version_store.VersionStore, + tickstore.TICK_STORE_TYPE: tickstore.TickStore, + toplevel.TICK_STORE_TYPE: toplevel.TopLevelTickStore + } + + +def register_library_type(name, type_): + """ + Register a Arctic Library Type handler + """ + if name in LIBRARY_TYPES: + raise ArcticException("Library %s already registered as %s" % (name, LIBRARY_TYPES[name])) + LIBRARY_TYPES[name] = type_ + + +class Arctic(object): + """ + The Arctic class is a top-level God object, owner of all arctic_ databases + accessible in Mongo. + Each database contains one or more ArcticLibrarys which may have implementation + specific functionality. + + Current Mongo Library types: + - arctic.VERSION_STORE - Versioned store for chunked Pandas and numpy objects + (other Python types are pickled) + - arctic.TICK_STORE - Tick specific library. Supports 'snapshots', efficiently + stores updates, not versioned. + + Arctic and ArcticLibrary are responsible for Connection setup, authentication, + dispatch to the appropriate library implementation, and quotas. + """ + DB_PREFIX = 'arctic' + METADATA_COLL = "ARCTIC" + METADATA_DOC_ID = "ARCTIC_META" + + _MAX_CONNS = 4 + __conn = None + + def __init__(self, mongo_host, app_name=APPLICATION_NAME, allow_secondary=False, + socketTimeoutMS=10 * 60 * 1000, connectTimeoutMS=2 * 1000, + serverSelectionTimeoutMS=30 * 1000): + """ + Constructs a Arctic Datastore. + + Parameters: + ----------- + mongo_host: A MongoDB hostname, alias or Mongo Connection + + app_name: `str` is the name of application used for resolving credentials when + authenticating against the mongo_host. + We will fetch credentials using the authentication hook. + Teams should override this such that different applications don't accidentally + run with privileges to other applications' databases + + allow_secondary: `bool` indicates if we allow reads against + secondary members in the cluster. These reads may be + a few seconds behind (but are usually split-second up-to-date). + + serverSelectionTimeoutMS: `int` the main tunable used for configuring how long + the pymongo driver will spend on MongoDB cluster discovery. 
This parameter + takes precedence over connectTimeoutMS: https://jira.mongodb.org/browse/DRIVERS-222 + + """ + self._application_name = app_name + self._library_cache = {} + self._allow_secondary = allow_secondary + self._socket_timeout = socketTimeoutMS + self._connect_timeout = connectTimeoutMS + self._server_selection_timeout = serverSelectionTimeoutMS + + if isinstance(mongo_host, basestring): + self.mongo_host = mongo_host + else: + self.__conn = mongo_host + # Workaround for: https://jira.mongodb.org/browse/PYTHON-927 + mongo_host.server_info() + self.mongo_host = ",".join(["{}:{}".format(x[0], x[1]) for x in mongo_host.nodes]) + self._adminDB = self._conn.admin + + @property + def _conn(self): + if self.__conn is None: + host = get_mongodb_uri(self.mongo_host) + logger.info("Connecting to mongo: {0} ({1})".format(self.mongo_host, host)) + self.__conn = mongo_retry(pymongo.MongoClient)(host=host, + maxPoolSize=self._MAX_CONNS, + socketTimeoutMS=self._socket_timeout, + connectTimeoutMS=self._connect_timeout, + serverSelectionTimeoutMS=self._server_selection_timeout) + self._adminDB = self.__conn.admin + + # Authenticate against admin for the user + auth = get_auth(self.mongo_host, self._application_name, 'admin') + if auth: + authenticate(self._adminDB, auth.user, auth.password) + + # Accessing _conn is synchronous. The new PyMongo driver may be lazier than the previous. + # Force a connection. + self.__conn.server_info() + + return self.__conn + + def __str__(self): + return "" % (hex(id(self)), str(self._conn)) + + def __repr__(self): + return str(self) + + def __getstate__(self): + return {'mongo_host': self.mongo_host, 'allow_secondary': self._allow_secondary} + + def __setstate__(self, state): + return Arctic.__init__(self, **state) + + @mongo_retry + def list_libraries(self): + """ + Returns + ------- + list of Arctic library names + """ + libs = [] + for db in self._conn.database_names(): + if db.startswith(self.DB_PREFIX + '_'): + for coll in self._conn[db].collection_names(): + if coll.endswith(self.METADATA_COLL): + libs.append(db[len(self.DB_PREFIX) + 1:] + "." + coll[:-1 * len(self.METADATA_COLL) - 1]) + elif db == self.DB_PREFIX: + for coll in self._conn[db].collection_names(): + if coll.endswith(self.METADATA_COLL): + libs.append(coll[:-1 * len(self.METADATA_COLL) - 1]) + return libs + + @mongo_retry + def initialize_library(self, library, lib_type=VERSION_STORE, **kwargs): + """ + Create an Arctic Library or a particular type. + + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + + lib_type : `str` + The type of the library. e.g. arctic.VERSION_STORE or arctic.TICK_STORE + Or any type registered with register_library_type + Default: arctic.VERSION_STORE + + kwargs : + Arguments passed to the Library type for initialization. + """ + l = ArcticLibraryBinding(self, library) + # Check that we don't create too many namespaces + if len(self._conn[l.database_name].collection_names()) > 3000: + raise ArcticException("Too many namespaces %s, not creating: %s" % + (len(self._conn[l.database_name].collection_names()), library)) + l.set_library_type(lib_type) + LIBRARY_TYPES[lib_type].initialize_library(l, **kwargs) + # Add a 10G quota just in case the user is calling this with API. + if not l.get_quota(): + l.set_quota(10 * 1024 * 1024 * 1024) + + @mongo_retry + def delete_library(self, library): + """ + Delete an Arctic Library, and all associated collections in the MongoDB. 
+ + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + """ + l = ArcticLibraryBinding(self, library) + colname = l.get_top_level_collection().name + logger.info('Dropping collection: %s' % colname) + l._db.drop_collection(colname) + for coll in l._db.collection_names(): + if coll.startswith(colname + '.'): + logger.info('Dropping collection: %s' % coll) + l._db.drop_collection(coll) + if library in self._library_cache: + del self._library_cache[library] + del self._library_cache[l.get_name()] + + def get_library(self, library): + """ + Return the library instance. Can generally use slicing to return the library: + arctic_store[library] + + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + """ + if library in self._library_cache: + return self._library_cache[library] + + try: + error = None + l = ArcticLibraryBinding(self, library) + lib_type = l.get_library_type() + except (OperationFailure, AutoReconnect), e: + error = e + + if error or not lib_type: + raise LibraryNotFoundException("Library %s was not correctly initialized in %s.\nReason: %s" % (library, self, error)) + elif lib_type not in LIBRARY_TYPES: + raise LibraryNotFoundException("Couldn't load LibraryType '%s' for '%s' (has the class been registered?)" % + (lib_type, library)) + instance = LIBRARY_TYPES[lib_type](l) + self._library_cache[library] = instance + # The library official name may be different from 'library': e.g. 'library' vs 'user.library' + self._library_cache[l.get_name()] = instance + return self._library_cache[library] + + def __getitem__(self, key): + if isinstance(key, basestring): + return self.get_library(key) + else: + raise ArcticException("Unrecognised library specification - use [libraryName]") + + def set_quota(self, library, quota): + """ + Set a quota (in bytes) on this user library. The quota is 'best effort', + and should be set conservatively. + + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + + quota : `int` + Advisory quota for the library - in bytes + """ + l = ArcticLibraryBinding(self, library) + l.set_quota(quota) + + def get_quota(self, library): + """ + Return the quota currently set on the library. + + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + """ + l = ArcticLibraryBinding(self, library) + return l.get_quota() + + def check_quota(self, library): + """ + Check the quota on the library, as would be done during normal writes. + + Parameters + ---------- + library : `str` + The name of the library. e.g. 'library' or 'user.library' + + Raises + ------ + arctic.exceptions.QuotaExceededException if the quota has been exceeded + """ + l = ArcticLibraryBinding(self, library) + l.check_quota() + + +class ArcticLibraryBinding(object): + """ + The ArcticLibraryBinding type holds the binding between the library name and the + concrete implementation of the library. + + Also provides access to additional metadata about the library + - Access to the library's top-level collection + - Enforces quota on the library + - Access to custom metadata about the library + """ + DB_PREFIX = Arctic.DB_PREFIX + TYPE_FIELD = "TYPE" + QUOTA = 'QUOTA' + + quota = None + quota_countdown = 0 + + @classmethod + def _parse_db_lib(clz, library): + """ + Returns the canonical (database_name, library) for the passed in + string 'library'. 
+ """ + database_name = library.split('.', 2) + if len(database_name) == 2: + library = database_name[1] + if database_name[0].startswith(clz.DB_PREFIX): + database_name = database_name[0] + else: + database_name = clz.DB_PREFIX + '_' + database_name[0] + else: + database_name = clz.DB_PREFIX + return database_name, library + + def __init__(self, arctic, library): + self.arctic = arctic + database_name, library = self._parse_db_lib(library) + self.library = library + self.database_name = database_name + self._db = self.arctic._conn[database_name] + self._auth(self._db) + self._library_coll = self._db[library] + + def __str__(self): + return """ +%s""" % (hex(id(self)), self._db.name, self._library_coll.name, indent(str(self.arctic), 4)) + + def __repr__(self): + return str(self) + + def __getstate__(self): + return {'arctic': self.arctic, 'library': '.'.join([self.database_name, self.library])} + + def __setstate__(self, state): + return ArcticLibraryBinding.__init__(self, state['arctic'], state['library']) + + @mongo_retry + def _auth(self, database): + #Get .mongopass details here + if not hasattr(self.arctic, 'mongo_host'): + return + + auth = get_auth(self.arctic.mongo_host, self.arctic._application_name, database.name) + if auth: + authenticate(self._db, auth.user, auth.password) + self.arctic._conn.close() + + def get_name(self): + return self._db.name + '.' + self._library_coll.name + + def get_top_level_collection(self): + return self._library_coll + + def set_quota(self, quota_bytes): + """ + Set a quota (in bytes) on this user library. The quota is 'best effort', + and should be set conservatively. + + A quota of 0 is 'unlimited' + """ + self.set_library_metadata(ArcticLibraryBinding.QUOTA, quota_bytes) + self.quota = quota_bytes + self.quota_countdown = 0 + + def get_quota(self): + """ + Get the current quota on this user library. + """ + return self.get_library_metadata(ArcticLibraryBinding.QUOTA) + + def check_quota(self): + """ + Check whether the user is within quota. Should be called before + every write. Will raise() if the library has exceeded its allotted + quota. + """ + # Don't check on every write + if self.quota is None: + self.quota = self.get_library_metadata(ArcticLibraryBinding.QUOTA) + if self.quota is None: + self.quota = 0 + + if self.quota == 0: + return + + # Don't check on every write, that would be slow + if self.quota_countdown > 0: + self.quota_countdown -= 1 + return + + # Figure out whether the user has exceeded their quota + library = self.arctic[self.get_name()] + stats = library.stats() + + def to_gigabytes(bytes): + return bytes / 1024. / 1024. / 1024. + + # Have we exceeded our quota? + size = stats['totals']['size'] + count = stats['totals']['count'] + if size >= self.quota: + raise QuotaExceededException("Quota Exceeded: %.3f / %.0f GB used" % + (to_gigabytes(size), + to_gigabytes(self.quota))) + + # Quota not exceeded, print an informational message and return + avg_size = size / count if count > 1 else 100 * 1024 + remaining = self.quota - size + remaining_count = remaining / avg_size + if remaining_count < 100: + logger.warn("Mongo Quota: %.3f / %.0f GB used" % (to_gigabytes(size), + to_gigabytes(self.quota))) + else: + logger.info("Mongo Quota: %.3f / %.0f GB used" % (to_gigabytes(size), + to_gigabytes(self.quota))) + + # Set-up a timer to prevent us for checking for a few writes. 
+ self.quota_countdown = max(remaining_count / 2, 1) + + def get_library_type(self): + return self.get_library_metadata(ArcticLibraryBinding.TYPE_FIELD) + + def set_library_type(self, lib_type): + self.set_library_metadata(ArcticLibraryBinding.TYPE_FIELD, lib_type) + + @mongo_retry + def get_library_metadata(self, field): + lib_metadata = self._library_coll[self.arctic.METADATA_COLL].find_one({"_id": self.arctic.METADATA_DOC_ID}) + if lib_metadata is not None: + return lib_metadata.get(field) + else: + return None + + @mongo_retry + def set_library_metadata(self, field, value): + self._library_coll[self.arctic.METADATA_COLL].update_one({'_id': self.arctic.METADATA_DOC_ID}, + {'$set': {field: value}}, upsert=True) diff --git a/arctic/auth.py b/arctic/auth.py new file mode 100644 index 000000000..ec0786871 --- /dev/null +++ b/arctic/auth.py @@ -0,0 +1,28 @@ +from collections import namedtuple + +from .logging import logger + + +def authenticate(db, user, password): + """ + Return True / False on authentication success. + + PyMongo 2.6 changed the auth API to raise on Auth failure. + """ + from pymongo.errors import PyMongoError + try: + logger.debug("Authenticating {} with {}".format(db, user)) + return db.authenticate(user, password) + except PyMongoError, e: + logger.debug("Auth Error %s" % e) + return False + + +Credential = namedtuple("MongoCredentials", ['database', 'user', 'password']) + + +def get_auth(host, app_name, database_name): + """ + Authentication hook to allow plugging in custom authentication credential providers + """ + return None diff --git a/arctic/date/__init__.py b/arctic/date/__init__.py new file mode 100644 index 000000000..709df92b7 --- /dev/null +++ b/arctic/date/__init__.py @@ -0,0 +1,5 @@ +from ._daterange import DateRange +from ._generalslice import OPEN_CLOSED, CLOSED_OPEN, OPEN_OPEN, CLOSED_CLOSED +from ._util import datetime_to_ms, ms_to_datetime +from ._util import string_to_daterange, to_pandas_closed_closed +from ._mktz import mktz, TimezoneError diff --git a/arctic/date/_daterange.py b/arctic/date/_daterange.py new file mode 100644 index 000000000..df494f34d --- /dev/null +++ b/arctic/date/_daterange.py @@ -0,0 +1,194 @@ +import datetime +from datetime import timedelta +from dateutil.tz import tzlocal + +from ..logging import logger +from ._generalslice import OPEN_OPEN, CLOSED_CLOSED, OPEN_CLOSED, CLOSED_OPEN, GeneralSlice +from ._parse import parse + + +INTERVAL_LOOKUP = {(True, True): OPEN_OPEN, + (False, False): CLOSED_CLOSED, + (True, False): OPEN_CLOSED, + (False, True): CLOSED_OPEN + } + + +class DateRange(GeneralSlice): + """ + Represents a bounded datetime range. + + Ranges may be bounded on either end if a date is + specified for the start or end of the range, or unbounded + if None is specified for either value. Unbounded ranges will allow + all available data to pass through when used as a filter argument + on function or method. 
+ + ===== ==== ============================ =============================== + start end interval Meaning + ----- ---- ---------------------------- ------------------------------- + None None any date + a None CLOSED_CLOSED or CLOSED_OPEN date >= a + a None OPEN_CLOSED or OPEN_OPEN date > a + None b CLOSED_CLOSED or OPEN_CLOSED date <= b + None b CLOSED_OPEN or OPEN_OPEN date < b + a b CLOSED_CLOSED date >= a and date <= b + a b OPEN_CLOSED date > a and date <= b + a b CLOSED_OPEN date >= a and date < b + a b OPEN_OPEN date > a and date < b + ===== ==== ============================ =============================== + + Parameters + ---------- + start : `int`, `str` or `datetime.datetime` + lower bound date value as an integer, string or datetime object. + + end : `int`, `str` or `datetime.datetime` + upper bound date value as an integer, string or datetime object. + + interval : `int` + CLOSED_CLOSED, OPEN_CLOSED, CLOSED_OPEN or OPEN_OPEN. + **Default is CLOSED_CLOSED**. + """ + def __init__(self, start=None, end=None, interval=CLOSED_CLOSED): + + def _is_dt_type(x): + return isinstance(x, (datetime.datetime, datetime.date)) + + def _compute_bound(value, desc): + if isinstance(value, (int, str)): + return parse(str(value)) + elif _is_dt_type(value): + return value + elif value is None: + return None + else: + raise TypeError('unsupported type for %s: %s' % (desc, type(value))) + + super(DateRange, self).__init__(_compute_bound(start, "start"), _compute_bound(end, "end"), 1, interval) + + if _is_dt_type(self.start) and _is_dt_type(self.end): + if self.start > self.end: + raise ValueError('start date (%s) cannot be greater than end date (%s)!' + % (self.start, self.end)) + + @property + def unbounded(self): + """True if range is unbounded on either or both ends, False otherwise.""" + return self.start is None or self.end is None + + def intersection(self, other): + """ + Create a new DateRange representing the maximal range enclosed by this range and other + """ + startopen = other.startopen if self.start is None \ + else self.startopen if other.start is None \ + else other.startopen if self.start < other.start \ + else self.startopen if self.start > other.start \ + else (self.startopen and other.startopen) + endopen = other.endopen if self.end is None \ + else self.endopen if other.end is None \ + else other.endopen if self.end > other.end \ + else self.endopen if self.end < other.end \ + else (self.endopen and other.endopen) + + new_start = self.start if other.start is None \ + else other.start if self.start is None \ + else max(self.start, other.start) + new_end = self.end if other.end is None \ + else other.end if self.end is None \ + else min(self.end, other.end) + + interval = INTERVAL_LOOKUP[(startopen, endopen)] + + return DateRange(new_start, new_end, interval) + + def as_dates(self): + """ + Create a new DateRange with the datetimes converted to dates and changing to CLOSED/CLOSED. + """ + new_start = self.start.date() if self.start and isinstance(self.start, datetime.datetime) else self.start + new_end = self.end.date() if self.end and isinstance(self.end, datetime.datetime) else self.end + return DateRange(new_start, new_end, CLOSED_CLOSED) + + def mongo_query(self): + """ + Convert a DateRange into a MongoDb query string. FIXME: Mongo can only handle + datetimes in queries, so we should make this handle the case where start/end are + datetime.date and extend accordingly (being careful about the interval logic). 
+ """ + comps = {OPEN_CLOSED: ('t', 'te'), OPEN_OPEN: ('t', 't'), + CLOSED_OPEN: ('te', 't'), CLOSED_CLOSED: ('te', 'te')} + query = {} + comp = comps[self.interval] + if self.start: + query['$g' + comp[0]] = self.start + if self.end: + query['$l' + comp[1]] = self.end + return query + + def get_date_bounds(self): + """ + Return the upper and lower bounds along + with operators that are needed to do an 'in range' test. + Useful for SQL commands. + + Returns + ------- + tuple: (`str`, `date`, `str`, `date`) + (date_gt, start, date_lt, end) + e.g.: + ('>=', start_date, '<', end_date) + """ + start = end = None + date_gt = '>=' + date_lt = '<=' + if self: + if self.start: + start = self.start + if self.end: + end = self.end + if self.startopen: + date_gt = '>' + if self.endopen: + date_lt = '<' + + return date_gt, start, date_lt, end + + def __contains__(self, d): + if self.interval == CLOSED_CLOSED: + return (self.start is None or d >= self.start) and (self.end is None or d <= self.end) + elif self.interval == CLOSED_OPEN: + return (self.start is None or d >= self.start) and (self.end is None or d < self.end) + elif self.interval == OPEN_CLOSED: + return (self.start is None or d > self.start) and (self.end is None or d <= self.end) + + return (self.start is None or d > self.start) and (self.end is None or d < self.end) + + def __repr__(self): + return 'DateRange(start=%r, end=%r)' % (self.start, self.end) + + def __eq__(self, rhs): + if rhs is None or not (hasattr(rhs, "end") and hasattr(rhs, "start")): + return False + return self.end == rhs.end and self.start == rhs.start + + def __hash__(self): + return hash((self.start, self.end, self.step, self.interval)) + + def __getitem__(self, key): + if key == 0: + return self.start + elif key == 1: + return self.end + else: + raise IndexError('Index %s not in range (0:1)' % key) + + __str__ = __repr__ + + def __setstate__(self, state): + """Called by pickle, PyYAML etc to set state.""" + self.start = state['start'] + self.end = state['end'] + self.interval = state.get('interval') or CLOSED_CLOSED + self.step = 1 diff --git a/arctic/date/_generalslice.py b/arctic/date/_generalslice.py new file mode 100644 index 000000000..3fe1f99cc --- /dev/null +++ b/arctic/date/_generalslice.py @@ -0,0 +1,43 @@ +from enum import Enum + + +class Intervals(Enum): + (OPEN_CLOSED, CLOSED_OPEN, OPEN_OPEN, CLOSED_CLOSED) = range(1101, 1105) +(OPEN_CLOSED, CLOSED_OPEN, OPEN_OPEN, CLOSED_CLOSED) = INTERVALS = Intervals.__members__.values() + + +class GeneralSlice(object): + """General slice object, supporting open/closed ranges: + + ===== ==== ============================ =============================== + start end interval Meaning + ----- ---- ---------------------------- ------------------------------- + None None any item + a None CLOSED_CLOSED or CLOSED_OPEN item >= a + a None OPEN_CLOSED or OPEN_OPEN item > a + None b CLOSED_CLOSED or OPEN_CLOSED item <= b + None b CLOSED_OPEN or OPEN_OPEN item < b + a b CLOSED_CLOSED item >= a and item <= b + a b OPEN_CLOSED item > a and item <= b + a b CLOSED_OPEN item >= a and item < b + a b OPEN_OPEN item > a and item < b + ===== ==== ============================ =============================== + """ + + def __init__(self, start, end, step=None, interval=CLOSED_CLOSED): + self.start = start + self.end = end + self.step = step + self.interval = interval + + @property + def startopen(self): + """True if the start of the range is open (item > start), + False if the start of the range is closed (item >= start).""" + return 
self.interval in (OPEN_CLOSED, OPEN_OPEN) + + @property + def endopen(self): + """True if the end of the range is open (item < end), + False if the end of the range is closed (item <= end).""" + return self.interval in (CLOSED_OPEN, OPEN_OPEN) diff --git a/arctic/date/_mktz.py b/arctic/date/_mktz.py new file mode 100644 index 000000000..284f8905f --- /dev/null +++ b/arctic/date/_mktz.py @@ -0,0 +1,76 @@ +import bisect +import os +import dateutil +from decorator import decorator +import time +import tzlocal + +DEFAULT_TIME_ZONE_NAME = tzlocal.get_localzone().zone # 'Europe/London' +TIME_ZONE_DATA_SOURCE = '/usr/share/zoneinfo/' + + +class TimezoneError(Exception): + pass + + +class tzfile(dateutil.tz.tzfile): + + def _find_ttinfo(self, dtm, laststd=0): + """Faster version of parent class's _find_ttinfo() as this uses bisect rather than a linear search.""" + if dtm is None: + # This will happen, for example, when a datetime.time object gets utcoffset() called. + raise ValueError('tzinfo object can not calculate offset for date %s' % dtm) + ts = ((dtm.toordinal() - dateutil.tz.EPOCHORDINAL) * 86400 + + dtm.hour * 3600 + + dtm.minute * 60 + + dtm.second) + idx = bisect.bisect_right(self._trans_list, ts) + if len(self._trans_list) == 0 or idx == len(self._trans_list): + return self._ttinfo_std + if idx == 0: + return self._ttinfo_before + if laststd: + while idx > 0: + tti = self._trans_idx[idx - 1] + if not tti.isdst: + return tti + idx -= 1 + else: + return self._ttinfo_std + else: + return self._trans_idx[idx - 1] + + +def mktz(zone=None): + """ + Return a new timezone based on the zone using the python-dateutil + package. This convenience method is useful for resolving the timezone + names as dateutil.tz.tzfile requires the full path. + + The concise name 'mktz' is for convenient when using it on the + console. + + Parameters + ---------- + zone : `String` + The zone for the timezone. This defaults to 'local'. + + Returns + ------- + An instance of a timezone which implements the tzinfo interface. + + Raises + - - - - - - + TimezoneError : Raised if a user inputs a bad timezone name. + """ + + if zone is None: + zone = DEFAULT_TIME_ZONE_NAME + _path = os.path.join(TIME_ZONE_DATA_SOURCE, zone) + try: + tz = tzfile(_path) + except (ValueError, IOError) as err: + raise TimezoneError('Timezone "%s" can not be read, error: "%s"' % (zone, err)) + # Stash the zone name as an attribute (as pytz does) + tz.zone = zone if not zone.startswith(TIME_ZONE_DATA_SOURCE) else zone[len(TIME_ZONE_DATA_SOURCE):] + return tz diff --git a/arctic/date/_parse.py b/arctic/date/_parse.py new file mode 100644 index 000000000..598a09b39 --- /dev/null +++ b/arctic/date/_parse.py @@ -0,0 +1,10 @@ +from dateutil.parser import parse as _parse + + +def parse(string, agnostic=False, **kwargs): + parsed = _parse(string, **kwargs) + if agnostic or (parsed == _parse(string, yearfirst=True, **kwargs) + == _parse(string, dayfirst=True, **kwargs)): + return parsed + else: + raise ValueError("The date was ambiguous: %s" % string) diff --git a/arctic/date/_util.py b/arctic/date/_util.py new file mode 100644 index 000000000..b2f25ea7d --- /dev/null +++ b/arctic/date/_util.py @@ -0,0 +1,123 @@ +import calendar +import datetime +from datetime import timedelta + +from ..logging import logger +from ._daterange import DateRange +from ._generalslice import OPEN_OPEN, CLOSED_CLOSED, OPEN_CLOSED, CLOSED_OPEN, GeneralSlice +from ._parse import parse +from ._mktz import mktz + + +# Support standard brackets syntax for open/closed ranges. 
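+# For example, '[20111020-20120120)' selects dates d with 20111020 <= d < 20120120
+# (see string_to_daterange below).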
+Ranges = {'()': OPEN_OPEN, + '(]': OPEN_CLOSED, + '[)': CLOSED_OPEN, + '[]': CLOSED_CLOSED} + + +def string_to_daterange(str_range, delimiter='-', as_dates=False, interval=CLOSED_CLOSED): + """ + Convert a string to a DateRange type. If you put only one date, it generates the + relevant range for just that date or datetime till 24 hours later. You can optionally + use mixtures of []/() around the DateRange for OPEN/CLOSED interval behaviour. + + Parameters + ---------- + str_range : `String` + The range as a string of dates separated by one delimiter. + + delimiter : `String` + The separator between the dates, using '-' as default. + + as_dates : `Boolean` + True if you want the date-range to use datetime.date rather than datetime.datetime. + + interval : `int` + CLOSED_CLOSED, OPEN_CLOSED, CLOSED_OPEN or OPEN_OPEN. + **Default is CLOSED_CLOSED**. + + Returns + ------- + `arctic.date.DateRange` : the DateRange parsed from the string. + + Examples + -------- + >>> from arctic.date import string_to_daterange + >>> string_to_daterange('20111020', as_dates=True) + DateRange(start=datetime.date(2011, 10, 20), end=datetime.date(2011, 10, 21)) + + >>> string_to_daterange('201110201030') + DateRange(start=datetime.datetime(2011, 10, 20, 10, 30), end=datetime.datetime(2011, 10, 21, 10, 30)) + + >>> string_to_daterange('20111020-20120120', as_dates=True) + DateRange(start=datetime.date(2011, 10, 20), end=datetime.date(2012, 1, 20)) + + >>> string_to_daterange('[20111020-20120120)', as_dates=True) + DateRange(start=datetime.date(2011, 10, 20), end=datetime.date(2012, 1, 20)) + """ + num_dates = str_range.count(delimiter) + 1 + if num_dates > 2: + raise ValueError('Too many dates in input string [%s] with delimiter (%s)' % (str_range, delimiter)) + + # Allow the user to use the [date-date), etc. range syntax to specify the interval. + range_mode = Ranges.get(str_range[0] + str_range[-1], None) + if range_mode: + return string_to_daterange(str_range[1:-1], delimiter, as_dates, interval=range_mode) + + if as_dates: + parse_dt = lambda s: parse(s).date() if s else None + else: + parse_dt = lambda s: parse(s) if s else None + if num_dates == 2: + d = [parse_dt(x) for x in str_range.split(delimiter)] + oc = interval + else: + start = parse_dt(str_range) + d = [start, start + datetime.timedelta(1)] + oc = CLOSED_OPEN # Always use closed-open for a single date/datetime. + return DateRange(d[0], d[1], oc) + + +def to_pandas_closed_closed(date_range): + """ + Pandas DateRange slicing is CLOSED-CLOSED inclusive at both ends. + + Returns a date_range with start-end suitable for slicing in pandas. 
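+
+    For example (illustrative): an OPEN_OPEN range (a, b) is returned as the
+    CLOSED_CLOSED range [a + 1ms, b - 1ms]; closed bounds are passed through unchanged.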
+ """ + if not date_range: + return None + start = date_range.start + end = date_range.end + if start: + if date_range.startopen: + start += timedelta(milliseconds=1) + if end: + if date_range.endopen: + end -= timedelta(milliseconds=1) + return DateRange(start, end) + + +def ms_to_datetime(ms, tzinfo=None): + """Convert a millisecond time value to an offset-aware Python datetime object.""" + if not isinstance(ms, (int, long)): + raise TypeError('expected integer, not %s' % type(ms)) + + if tzinfo in (None, mktz()): + return datetime.datetime.fromtimestamp(ms * 1e-3, mktz()).replace(tzinfo=None) + + return datetime.datetime.fromtimestamp(ms * 1e-3, tzinfo) + + +def _add_tzone(dtm): + if dtm.tzinfo is None: + dtm = dtm.replace(tzinfo=mktz()) + return dtm + + +def datetime_to_ms(d): + """Convert a Python datetime object to a millisecond epoch (UTC) time value.""" + try: + return long((calendar.timegm(_add_tzone(d).utctimetuple()) + d.microsecond / 1000000.0) * 1e3) + except AttributeError: + raise TypeError('expect Python datetime object, not %s' % type(d)) diff --git a/arctic/decorators.py b/arctic/decorators.py new file mode 100644 index 000000000..635d0e2e8 --- /dev/null +++ b/arctic/decorators.py @@ -0,0 +1,87 @@ +from datetime import datetime +from functools import wraps +import os +from pymongo.errors import AutoReconnect, OperationFailure, DuplicateKeyError, ServerSelectionTimeoutError +import sys +from time import sleep + +from .logging import logger +from .hooks import _log_exception_hook as _log_exception + +_MAX_RETRIES = 15 + + +def _get_host(store): + ret = {} + if store: + try: + if isinstance(store, (list, tuple)): + store = store[0] + ret['l'] = store._arctic_lib.get_name() + ret['mnodes'] = ["{}:{}".format(h, p) for h, p in store._collection.database.client.nodes] + ret['mhost'] = "{}".format(store._arctic_lib.arctic.mongo_host) + except Exception: + # Sometimes get_name(), for example, fails if we're not connected to MongoDB. + pass + return ret + +_in_retry = False +_retry_count = 0 + + +def mongo_retry(f): + """ + Catch-all decorator that handles AutoReconnect and OperationFailure + errors from PyMongo + """ + log_all_exceptions = 'arctic' in f.__module__ if f.__module__ else False + + @wraps(f) + def f_retry(*args, **kwargs): + global _retry_count, _in_retry + top_level = not _in_retry + _in_retry = True + try: + while True: + try: + return f(*args, **kwargs) + except (DuplicateKeyError, ServerSelectionTimeoutError) as e: + # Re-raise errors that won't go away. 
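+                    # (These errors are treated as permanent: they are reported once via
+                    #  _handle_error below and then re-raised rather than retried.)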
+ _handle_error(f, e, _retry_count, **_get_host(args)) + raise + except (OperationFailure, AutoReconnect) as e: + _retry_count += 1 + _handle_error(f, e, _retry_count, **_get_host(args)) + except Exception as e: + if log_all_exceptions: + _log_exception(f.__name__, e, _retry_count, **_get_host(args)) + raise + finally: + if top_level: + _in_retry = False + _retry_count = 0 + return f_retry + + +def dump_bad_documents(*document): + """ + Dump bad documents to disk + """ + id = str(document[0]['_id']) + with open('/tmp/mongo_debug_' + str(os.getpid()) + '_' + id + '_' + str(datetime.now()), 'a') as f: + for d in document: + f.write(str(d) + '\n') + + +def _handle_error(f, e, retry_count, **kwargs): + if retry_count > _MAX_RETRIES: + logger.error('Too many retries %s [%s], raising' % (f.__name__, e)) + e.traceback = sys.exc_info()[2] + raise + log_fn = logger.warn if retry_count > 2 else logger.debug + log_fn('%s %s [%s], retrying %i' % (type(e), f.__name__, e, retry_count)) + # Log operation failure errors + _log_exception(f.__name__, e, retry_count, **kwargs) +# if 'unauthorized' in str(e): +# raise + sleep(0.01 * min((3 ** retry_count), 50)) # backoff... diff --git a/arctic/exceptions.py b/arctic/exceptions.py new file mode 100644 index 000000000..d18acc4ec --- /dev/null +++ b/arctic/exceptions.py @@ -0,0 +1,42 @@ +class ArcticException(Exception): + pass + + +class NoDataFoundException(ArcticException): + pass + + +class UnhandledDtypeException(ArcticException): + pass + + +class LibraryNotFoundException(ArcticException): + pass + + +class DuplicateSnapshotException(ArcticException): + pass + + +class StoreNotInitializedException(ArcticException): + pass + + +class OptimisticLockException(ArcticException): + pass + + +class ConcurrentModificationException(ArcticException): + pass + + +class QuotaExceededException(ArcticException): + pass + + +class UnorderedAppendException(ArcticException): + pass + + +class OverlappingDataException(ArcticException): + pass diff --git a/arctic/fixtures/__init__.py b/arctic/fixtures/__init__.py new file mode 100644 index 000000000..44e28ce84 --- /dev/null +++ b/arctic/fixtures/__init__.py @@ -0,0 +1,3 @@ +""" +Common Test fixtures so you don't need them in your own module... +""" diff --git a/arctic/fixtures/arctic.py b/arctic/fixtures/arctic.py new file mode 100644 index 000000000..63313507a --- /dev/null +++ b/arctic/fixtures/arctic.py @@ -0,0 +1,100 @@ +import getpass +import pytest as pytest + +from .. import arctic as m +from ..logging import logger +from ..decorators import mongo_retry +from ..tickstore.tickstore import TICK_STORE_TYPE + +from .mongo import mongo_proc, mongodb + +mongo_proc2 = mongo_proc(executable="mongod", port="?", + params='--nojournal ' + '--noauth ' + '--nohttpinterface ' + '--noprealloc ' + '--nounixsocket ' + '--smallfiles ' + '--syncdelay 0 ' + '--nssize=1 ' + '--quiet ' + ) +mongodb = mongodb('mongo_proc2') + + +# +# TODO: Using mongo_server_session here would be more efficient +# + +@pytest.fixture(scope="function") +def mongo_host(mongo_proc2): + return mongo_proc2.host + ":" + str(mongo_proc2.port) + + +@pytest.fixture(scope="function") +def arctic(mongodb): + logger.info('arctic.fixtures: arctic init()') + mongodb.drop_database('arctic') + mongodb.drop_database('arctic_{}'.format(getpass.getuser())) + arctic = m.Arctic(mongo_host=mongodb) + # Do not add global libraries here: use specific fixtures below. + # Remember, for testing it does not usually matter what your libraries are called. 
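+    # Illustrative usage from a test (function name and assertion are assumptions):
+    #     def test_connect(arctic):
+    #         assert arctic.list_libraries() == []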
+ return arctic + + +# A arctic which allows reads to hit the secondary +@pytest.fixture(scope="function") +def arctic_secondary(mongodb, arctic): + arctic = m.Arctic(mongo_host=mongodb, allow_secondary=True) + return arctic + + +@pytest.fixture(scope="function") +def library_name(): + return 'test.TEST' + + +@pytest.fixture(scope="function") +def user_library_name(): + return "{}.TEST".format(getpass.getuser()) + + +@pytest.fixture(scope="function") +def overlay_library_name(): + return "test.OVERLAY" + + +@pytest.fixture(scope="function") +def library(arctic, library_name): + # Add a single test library + arctic.initialize_library(library_name, m.VERSION_STORE, segment='month') + return arctic.get_library(library_name) + + +@pytest.fixture(scope="function") +def library_secondary(arctic_secondary, library_name): + arctic_secondary.initialize_library(library_name, m.VERSION_STORE, segment='month') + return arctic_secondary.get_library(library_name) + + +@pytest.fixture(scope="function") +def user_library(arctic, user_library_name): + arctic.initialize_library(user_library_name, m.VERSION_STORE, segment='month') + return arctic.get_library(user_library_name) + + +@pytest.fixture(scope="function") +def overlay_library(arctic, overlay_library_name): + """ Overlay library fixture, returns a pair of libs, read-write: ${name} and read-only: ${name}_RAW + """ + rw_name = overlay_library_name + ro_name = '{}_RAW'.format(overlay_library_name) + arctic.initialize_library(rw_name, m.VERSION_STORE, segment='year') + arctic.initialize_library(ro_name, m.VERSION_STORE, segment='year') + return arctic.get_library(rw_name), arctic.get_library(ro_name) + + +@pytest.fixture(scope="function") +def tickstore_lib(arctic, library_name): + arctic.initialize_library(library_name, TICK_STORE_TYPE) + return arctic.get_library(library_name) diff --git a/arctic/fixtures/mongo.py b/arctic/fixtures/mongo.py new file mode 100644 index 000000000..0f059e061 --- /dev/null +++ b/arctic/fixtures/mongo.py @@ -0,0 +1,144 @@ +# Copyright (C) 2013 by Clearcode +# and associates (see AUTHORS). + +# This file is part of pytest-dbfixtures. + +# pytest-dbfixtures is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# pytest-dbfixtures is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with pytest-dbfixtures. If not, see . + +import os +import pytest + +from path import path +from tempfile import mkdtemp + +from pytest_dbfixtures.executors import TCPExecutor +from pytest_dbfixtures.port import get_port +from pytest_dbfixtures.utils import get_config, try_import, get_process_fixture + + +def mongo_proc(executable=None, params=None, host=None, port=None, + logs_prefix=''): + """ + Mongo process factory. + + :param str executable: path to mongod + :param str params: params + :param str host: hostname + :param str port: exact port (e.g. '8000') + or randomly selected port: + '?' 
- any random available port + '2000-3000' - random available port from a given range + '4002,4003' - random of 4002 or 4003 ports + :param str logs_prefix: prefix for log filename + :rtype: func + :returns: function which makes a mongo process + """ + + @pytest.fixture(scope='function') + def mongo_proc_fixture(request): + """ + #. Get config. + #. Run a ``mongod`` process. + #. Stop ``mongod`` process after tests. + + .. note:: + `mongod `_ + + :param FixtureRequest request: fixture request object + :rtype: pytest_dbfixtures.executors.TCPExecutor + :returns: tcp executor + """ + config = get_config(request) + + # make a temporary directory for tests and delete it + # if tests have been finished + tmp = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'tmp') + if not os.path.exists(tmp): + os.mkdir(tmp) + tmpdir = path(mkdtemp(prefix='mongo_pytest_fixture', dir=tmp)) + request.addfinalizer(lambda: tmpdir.exists() and tmpdir.rmtree()) + + mongo_exec = executable or config.mongo.mongo_exec + mongo_params = params or config.mongo.params + + mongo_host = host or config.mongo.host + mongo_port = get_port(port or config.mongo.port) + + logsdir = path(request.config.getvalue('logsdir')) + mongo_logpath = logsdir / '{prefix}mongo.{port}.log'.format( + prefix=logs_prefix, + port=mongo_port + ) + + mongo_executor = TCPExecutor( + '{mongo_exec} --bind_ip {host} --port {port} --dbpath {dbpath} --logpath {logpath} {params}'.format( # noqa + mongo_exec=mongo_exec, + params=mongo_params, + host=mongo_host, + port=mongo_port, + dbpath=tmpdir, + logpath=mongo_logpath, + ), + host=mongo_host, + port=mongo_port, + ) + mongo_executor.start() + + request.addfinalizer(mongo_executor.stop) + + return mongo_executor + + return mongo_proc_fixture + + +def mongodb(process_fixture_name): + """ + Mongo database factory. + + :param str process_fixture_name: name of the process fixture + :rtype: func + :returns: function which makes a connection to mongo + """ + + @pytest.fixture + def mongodb_factory(request): + """ + #. Get pymongo module and config. + #. Get connection to mongo. + #. Drop collections before and after tests. + + :param FixtureRequest request: fixture request object + :rtype: pymongo.connection.Connection + :returns: connection to mongo database + """ + proc_fixture = get_process_fixture(request, process_fixture_name) + + pymongo, config = try_import('pymongo', request) + + mongo_host = proc_fixture.host + mongo_port = proc_fixture.port + + try: + client = pymongo.MongoClient + except AttributeError: + client = pymongo.Connection + + mongo_conn = client(mongo_host, mongo_port) + + return mongo_conn + + return mongodb_factory + + +__all__ = [mongodb, mongo_proc] diff --git a/arctic/hooks.py b/arctic/hooks.py new file mode 100644 index 000000000..2b80c19bb --- /dev/null +++ b/arctic/hooks.py @@ -0,0 +1,31 @@ + + +_resolve_mongodb_hook = lambda env: env +_log_exception_hook = lambda *args, **kwargs: None + + +def get_mongodb_uri(host): + """ + Return the MongoDB URI for the passed in host-alias / environment. + + Allows an indirection point for mapping aliases to particular + MongoDB instances. + """ + return _resolve_mongodb_hook(host) + + +def register_resolve_mongodb_hook(hook): + global _resolve_mongodb_hook + _mongodb_resolve_hook = hook + + +def log_exception(fn_name, exception, retry_count, **kwargs): + """ + External exception logging hook. 
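+
+    Illustrative example of installing a hook (hook name is an assumption):
+        >>> def my_hook(fn_name, exception, retry_count, **kwargs):
+        ...     pass  # e.g. forward to your monitoring system
+        >>> register_log_exception_hook(my_hook)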
+ """ + _log_exception_hook(fn_name, exception, retry_count, **kwargs) + + +def register_log_exception_hook(hook): + global _log_exception_hook + _log_exception_hook = hook diff --git a/arctic/hosts.py b/arctic/hosts.py new file mode 100644 index 000000000..5857aceb3 --- /dev/null +++ b/arctic/hosts.py @@ -0,0 +1,58 @@ +""" +Utilities to resolve a string to Mongo host, or a Arctic library. +""" +import ConfigParser +from ConfigParser import NoOptionError, NoSectionError +import os +import re +from weakref import WeakValueDictionary + +from .logging import logger + +__all__ = ['get_arctic_lib', 'get_arctic_for_library'] + + + +# Application environment variables +arctic_cache = WeakValueDictionary() + + +CONNECTION_STR = re.compile(r"(^\w+\.?\w+)@([^\s:]+:?\w+)$") + + +def get_arctic_lib(connection_string, **kwargs): + """ + Returns a mongo library for the given connection string + + Parameters + --------- + connection_string: `str` + Format must be one of the following: + library@trading for known mongo servers + library@hostname:port + + Returns: + -------- + Arctic library + """ + from .arctic import Arctic + m = CONNECTION_STR.match(connection_string) + if not m: + raise ValueError("connection string incorrectly formed: %s" % connection_string) + library, host = m.group(1), m.group(2) + return _get_arctic(host, **kwargs)[library] + + +def _get_arctic(instance, **kwargs): + # Consider any kwargs passed to the Arctic as discriminators for the cache + key = instance, frozenset(kwargs.iteritems()) + + # Don't create lots of Arctic instances + arctic = arctic_cache.get(key, None) + if not arctic: + # Create the instance. Note that Arctic now connects + # lazily so this doesn't connect until on creation. + from .arctic import Arctic + arctic = Arctic(instance, **kwargs) + arctic_cache[key] = arctic + return arctic diff --git a/arctic/logging.py b/arctic/logging.py new file mode 100644 index 000000000..c07c0a6a9 --- /dev/null +++ b/arctic/logging.py @@ -0,0 +1,6 @@ +from __future__ import absolute_import + +import logging as logger + +logger.basicConfig(format='%(asctime)s %(message)s', level='INFO') +logger = logger.getLogger('arctic') diff --git a/arctic/scripts/__init__.py b/arctic/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/arctic/scripts/arctic_copy_data.py b/arctic/scripts/arctic_copy_data.py new file mode 100644 index 000000000..4bac99ffe --- /dev/null +++ b/arctic/scripts/arctic_copy_data.py @@ -0,0 +1,100 @@ +import argparse +import os +import re +from multiprocessing import Pool +import pwd + +from arctic.decorators import _get_host +from arctic.store.audit import ArcticTransaction + +from ..logging import logger +from ..hosts import get_arctic_lib +from ..date import DateRange, to_pandas_closed_closed, CLOSED_OPEN, OPEN_CLOSED + +# Use the UID rather than environment variables for auditing +USER = pwd.getpwuid(os.getuid())[0] + + +def copy_symbols_helper(src, dest, log, force, splice): + def _copy_symbol(symbols): + for symbol in symbols: + with ArcticTransaction(dest, symbol, USER, log) as mt: + existing_data = dest.has_symbol(symbol) + if existing_data: + if force: + logger.warn("Symbol: %s already exists in destination, OVERWRITING" % symbol) + elif splice: + logger.warn("Symbol: %s already exists in destination, splicing in new data" % symbol) + else: + logger.warn("Symbol: {} already exists in {}@{}, use --force to overwrite or --splice to join with existing data".format(symbol, + _get_host(dest).get('l'), + _get_host(dest).get('mhost'))) + 
continue + + version = src.read(symbol) + new_data = version.data + + if existing_data and splice: + original_data = dest.read(symbol).data + before = original_data.ix[:to_pandas_closed_closed(DateRange(None, new_data.index[0].to_pydatetime(), interval=CLOSED_OPEN)).end] + after = original_data.ix[to_pandas_closed_closed(DateRange(new_data.index[-1].to_pydatetime(), None, interval=OPEN_CLOSED)).start:] + new_data = before.append(new_data).append(after) + + mt.write(symbol, new_data, metadata=version.metadata) + return _copy_symbol + + +def main(): + usage = """ + Copy data from one MongoDB instance to another. + + Example: + arctic_copy_data --log "Copying data" --src user.library@host1 --dest user.library@host2 symbol1 symbol2 + """ + p = argparse.ArgumentParser(usage=usage) + p.add_argument("--src", required=True, help="Source MongoDB like: library@hostname:port") + p.add_argument("--dest", required=True, help="Destination MongoDB like: library@hostname:port") + p.add_argument("--log", required=True, help="Data CR") + p.add_argument("--force", default=False, action='store_true', help="Force overwrite of existing data for symbol.") + p.add_argument("--splice", default=False, action='store_true', help="Keep existing data before and after the new data.") + p.add_argument("--parallel", default=1, type=int, help="Number of imports to run in parallel.") + p.add_argument("symbols", nargs='+', type=str, help="List of symbol regexes to copy from source to dest.") + + opts = p.parse_args() + + src = get_arctic_lib(opts.src) + dest = get_arctic_lib(opts.dest) + + logger.info("Copying data from %s -> %s" % (opts.src, opts.dest)) + + # Prune the list of symbols from the library according to the list of symbols. + required_symbols = set() + for symbol in opts.symbols: + required_symbols.update(src.list_symbols(regex=symbol)) + required_symbols = sorted(required_symbols) + + logger.info("Copying: {} symbols".format(len(required_symbols))) + if len(required_symbols) < 1: + logger.warn("No symbols found that matched those provided.") + return + + # Function we'll call to do the data copying + copy_symbol = copy_symbols_helper(src, dest, opts.log, opts.force, opts.splice) + + if opts.parallel > 1: + logger.info("Starting: {} jobs".format(opts.parallel)) + pool = Pool(processes=opts.parallel) + # Break the jobs into chunks for multiprocessing + chunk_size = len(required_symbols) / opts.parallel + chunk_size = max(chunk_size, 1) + chunks = [required_symbols[offs:offs + chunk_size] for offs in + range(0, len(required_symbols), chunk_size)] + assert sum(len(x) for x in chunks) == len(required_symbols) + pool.apply(copy_symbol, chunks) + else: + copy_symbol(required_symbols) + + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_create_user.py b/arctic/scripts/arctic_create_user.py new file mode 100644 index 000000000..de7e8d13e --- /dev/null +++ b/arctic/scripts/arctic_create_user.py @@ -0,0 +1,61 @@ +import optparse +import pymongo +import uuid +import base64 +import sys + +from ..auth import get_auth, authenticate +from ..hooks import get_mongodb_uri + + +def main(): + usage = """usage: %prog [options] username ... + + Create the user's personal Arctic database, and adds them, read-only + to the central admin database. + """ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_option("--password",dest="password", default=None, help="Password. 
Default: random") + parser.add_option("--admin-write", dest="admin", action='store_false', default=True, + help="Give write access to the admin DB. Default: False") + parser.add_option("--dryrun", "-n", dest="dryrun", action="store_true", help="Don't really do anything", default=False) + parser.add_option("--verbose", "-v", dest="verbose", action="store_true", help="Print some commentary", default=False) + parser.add_option("--nodb", dest="nodb", help="Don't create a 'personal' database", action="store_true", default=False) + + (opts, args) = parser.parse_args() + + c = pymongo.MongoClient(get_mongodb_uri(opts.host)) + credentials = get_auth(opts.host, 'admin', 'admin') + if not credentials: + print >>sys.stderr, "You have no admin credentials for instance '%s'" % (opts.host) + return + + if not authenticate(c.admin, credentials.user, credentials.password): + print >>sys.stderr, "Failed to authenticate to '%s' as '%s'" % (opts.host, credentials.user) + return + + for user in args: + + p = opts.password + + if p is None: + p = base64.b64encode(uuid.uuid4().bytes).replace('/', '')[:12] + + if not opts.dryrun: + if opts.verbose: + print "Adding user %s to DB %s" % (user, opts.host) + if not opts.nodb: + if opts.verbose: + print "Adding database arctic_%s to DB %s" % (user, opts.host) + c['arctic_' + user].add_user(user, p) + c.admin.add_user(user, p, read_only=opts.admin) + else: + print "DRYRUN: add user %s readonly %s nodb %s" % (user, opts.admin, opts.nodb) + + if not opts.password: + print "%-16s %s" % (user, p) + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_delete_library.py b/arctic/scripts/arctic_delete_library.py new file mode 100644 index 000000000..449efc689 --- /dev/null +++ b/arctic/scripts/arctic_delete_library.py @@ -0,0 +1,40 @@ +import optparse +import pymongo + +from ..logging import logger +from ..hooks import get_mongodb_uri +from ..arctic import Arctic +from .utils import do_db_auth + + +def main(): + usage = """usage: %prog [options] + + Deletes the named library from a user's database. + + Example: + %prog --host=hostname --library=arctic_jblackburn.my_library + """ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_option("--library", help="The name of the library. e.g. 'arctic_jblackburn.lib'") + + (opts, _) = parser.parse_args() + + if not opts.library: + parser.error('Must specify the full path of the library e.g. arctic_jblackburn.lib!') + + print "Deleting: %s on mongo %s" % (opts.library, opts.host) + c = pymongo.MongoClient(get_mongodb_uri(opts.host)) + + db_name = opts.library[:opts.library.index('.')] if '.' in opts.library else None + do_db_auth(opts.host, c, db_name) + store = Arctic(c) + store.delete_library(opts.library) + + logger.info("Library %s deleted" % opts.library) + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_enable_sharding.py b/arctic/scripts/arctic_enable_sharding.py new file mode 100644 index 000000000..7ad79ee67 --- /dev/null +++ b/arctic/scripts/arctic_enable_sharding.py @@ -0,0 +1,37 @@ +import optparse +import pymongo + +from ..arctic import Arctic +from ..auth import get_auth +from ..hooks import get_mongodb_uri +from .._util import enable_sharding +from ..auth import authenticate + + +def main(): + usage = """usage: %prog [options] arg1=value, arg2=value + + Enables sharding on the specified arctic library. 
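+
+    Example:
+      %prog --host=hostname --library=arctic_jblackburn.my_library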
+ """ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_option("--library", help="The name of the library. e.g. 'arctic_jblackburn.lib'") + + (opts, _) = parser.parse_args() + + if not opts.library or '.' not in opts.library: + parser.error('must specify the full path of the library e.g. arctic_jblackburn.lib!') + + print "Enabling-sharding: %s on mongo %s" % (opts.library, opts.host) + + c = pymongo.MongoClient(get_mongodb_uri(opts.host)) + credentials = get_auth(opts.host, 'admin', 'admin') + if credentials: + authenticate(c.admin, credentials.user, credentials.password) + store = Arctic(c) + enable_sharding(store, opts.library) + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_fsck.py b/arctic/scripts/arctic_fsck.py new file mode 100644 index 000000000..016f17611 --- /dev/null +++ b/arctic/scripts/arctic_fsck.py @@ -0,0 +1,72 @@ +import logging +import argparse + +from ..logging import logger +from ..hooks import get_mongodb_uri +from ..arctic import Arctic, ArcticLibraryBinding +from .utils import do_db_auth + + +def main(): + usage = """ + Check a Arctic Library for inconsistencies. + """ + + parser = argparse.ArgumentParser(usage=usage) + parser.add_argument("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_argument("--library", nargs='+', required=True, help="The name of the library. e.g. 'arctic_jblackburn.lib'") + parser.add_argument("-v", action='store_true', help="Verbose mode") + parser.add_argument("-f", action='store_true', help="Force ; Cleanup any problems found. (Default is dry-run.)") + parser.add_argument("-n", action='store_true', help="No FSCK ; just print stats.)") + + opts = parser.parse_args() + + if opts.v: + logger.setLevel(logging.DEBUG) + + if not opts.f: + logger.info("DRY-RUN: No changes will be made.") + + logger.info("FSCK'ing: %s on mongo %s" % (opts.library, opts.host)) + store = Arctic(get_mongodb_uri(opts.host)) + + for lib in opts.library: + # Auth to the DB for making changes + if opts.f: + database_name, _ = ArcticLibraryBinding._parse_db_lib(lib) + do_db_auth(opts.host, store._conn, database_name) + + orig_stats = store[lib].stats() + + logger.info('----------------------------') + if not opts.n: + store[lib]._fsck(not opts.f) + logger.info('----------------------------') + + final_stats = store[lib].stats() + logger.info('Stats:') + logger.info('Sharded: %s' % final_stats['chunks'].get('sharded', False)) + logger.info('Symbols: %10d' % len(store[lib].list_symbols())) + logger.info('Versions: %10d Change(+/-) %6d (av: %.2fMB)' % + (final_stats['versions']['count'], + final_stats['versions']['count'] - orig_stats['versions']['count'], + final_stats['versions'].get('avgObjSize', 0) / 1024. / 1024.)) + logger.info("Versions: %10.2fMB Change(+/-) %.2fMB" % + (final_stats['versions']['size'] / 1024. / 1024., + (final_stats['versions']['size'] - orig_stats['versions']['size']) / 1024. / 1024.)) + logger.info('Chunk Count: %7d Change(+/-) %6d (av: %.2fMB)' % + (final_stats['chunks']['count'], + final_stats['chunks']['count'] - orig_stats['chunks']['count'], + final_stats['chunks'].get('avgObjSize', 0) / 1024. / 1024.)) + logger.info("Chunks: %12.2fMB Change(+/-) %6.2fMB" % + (final_stats['chunks']['size'] / 1024. / 1024., + (final_stats['chunks']['size'] - orig_stats['chunks']['size']) / 1024. 
/ 1024.)) + logger.info('----------------------------') + + if not opts.f: + logger.info("Done: DRY-RUN: No changes made. (Use -f to fix any problems)") + else: + logger.info("Done.") + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_init_library.py b/arctic/scripts/arctic_init_library.py new file mode 100644 index 000000000..a6ca3685e --- /dev/null +++ b/arctic/scripts/arctic_init_library.py @@ -0,0 +1,53 @@ +import argparse +import pymongo + +from ..logging import logger +from ..hooks import get_mongodb_uri +from ..arctic import Arctic, VERSION_STORE, LIBRARY_TYPES, \ + ArcticLibraryBinding +from .utils import do_db_auth + + +def main(): + usage = """Initializes a named library in a user's database. Note that it will enable sharding on the underlying + collection if it can. To do this you must have admin credentials in arctic: + + Example: + arctic_init_library --host=hostname --library=arctic_jblackburn.my_library + """ + + parser = argparse.ArgumentParser(usage=usage) + parser.add_argument("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_argument("--library", help="The name of the library. e.g. 'arctic_jblackburn.lib'") + parser.add_argument("--type", default=VERSION_STORE, choices=sorted(LIBRARY_TYPES.keys()), + help="The type of the library, as defined in " + "arctic.py. Default: %s" % VERSION_STORE) + parser.add_argument("--quota", default=10, help="Quota for the library in GB. A quota of 0 is unlimited." + "Default: 10") + parser.add_argument("--hashed", action="store_true", default=False, help="Use hashed based sharding. Useful where SYMBOLs share a common prefix (e.g. Bloomberg BBGXXXX symbols)" + "Default: False") + + opts = parser.parse_args() + + if not opts.library or '.' not in opts.library \ + or not opts.library.startswith('arctic'): + parser.error('Must specify the full path of the library e.g. arctic_jblackburn.library!') + db_name, _ = ArcticLibraryBinding._parse_db_lib(opts.library) + + print "Initializing: %s on mongo %s" % (opts.library, opts.host) + c = pymongo.MongoClient(get_mongodb_uri(opts.host)) + + if not do_db_auth(opts.host, c, db_name): + logger.error('Authentication Failed. Exiting.') + return + + store = Arctic(c) + store.initialize_library("%s" % opts.library, opts.type, hashed=opts.hashed) + logger.info("Library %s created" % opts.library) + + logger.info("Setting quota to %sG" % opts.quota) + store.set_quota(opts.library, int(opts.quota) * 1024 * 1024 * 1024) + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_list_libraries.py b/arctic/scripts/arctic_list_libraries.py new file mode 100644 index 000000000..bb9eca69a --- /dev/null +++ b/arctic/scripts/arctic_list_libraries.py @@ -0,0 +1,31 @@ +from __future__ import print_function +import optparse + +from ..arctic import Arctic + +print = print + + +def main(): + usage = """usage: %prog [options] [prefix ...] + + Lists the libraries available in a user's database. If any prefix parameters + are given, list only libraries with names that start with one of the prefixes. + + Example: + %prog --host=hostname rgautier + """ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--host", default='localhost', help="Hostname, or clustername. 
Default: localhost") + + (opts, args) = parser.parse_args() + + store = Arctic(opts.host) + for name in sorted(store.list_libraries()): + if (not args) or [n for n in args if name.startswith(n)]: + print(name) + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/arctic_prune_versions.py b/arctic/scripts/arctic_prune_versions.py new file mode 100644 index 000000000..8ef01e3e2 --- /dev/null +++ b/arctic/scripts/arctic_prune_versions.py @@ -0,0 +1,59 @@ +import optparse +import pymongo + +from ..logging import logger +from ..hooks import get_mongodb_uri +from ..arctic import Arctic, ArcticLibraryBinding +from .utils import do_db_auth + + +def prune_versions(lib, symbol, keep_mins): + lib._prune_previous_versions(symbol, keep_mins=keep_mins) + + +def main(): + usage = """usage: %prog [options] + + Prunes (i.e. deletes) versions of data that are not the most recent, and are older than 10 minutes, + and are not in use by snapshots. Must be used on a Arctic VersionStore library instance. + + Example: + arctic_prune_versions --host=hostname --library=arctic_jblackburn.my_library + """ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--host", default='localhost', help="Hostname, or clustername. Default: localhost") + parser.add_option("--library", help="The name of the library. e.g. 'arctic_jblackburn.library'") + parser.add_option("--symbols", help="The symbols to prune - comma separated (default all)") + parser.add_option("--keep-mins", default=10, help="Ensure there's a version at least keep-mins old. Default:10") + + + (opts, _) = parser.parse_args() + + if not opts.library: + parser.error('Must specify the Arctic library e.g. arctic_jblackburn.library!') + db_name, _ = ArcticLibraryBinding._parse_db_lib(opts.library) + + print "Pruning (old) versions in : %s on mongo %s" % (opts.library, opts.host) + print "Keeping all versions <= %s mins old" % (opts.keep_mins) + c = pymongo.MongoClient(get_mongodb_uri(opts.host)) + + if not do_db_auth(opts.host, c, db_name): + logger.error('Authentication Failed. Exiting.') + return + lib = Arctic(c)[opts.library] + + if opts.symbols: + symbols = opts.symbols.split(',') + else: + symbols = lib.list_symbols(all_symbols=True) + logger.info("Found %s symbols" % len(symbols)) + + for s in symbols: + logger.info("Pruning %s" % s) + prune_versions(lib, s, opts.keep_mins) + logger.info("Done") + + +if __name__ == '__main__': + main() diff --git a/arctic/scripts/utils.py b/arctic/scripts/utils.py new file mode 100644 index 000000000..9b34c8451 --- /dev/null +++ b/arctic/scripts/utils.py @@ -0,0 +1,34 @@ +from ..logging import logger +from ..auth import get_auth, authenticate + + +def do_db_auth(host, connection, db_name): + """ + Attempts to authenticate against the mongo instance. + + Tries: + - Auth'ing against admin as 'admin' ; credentials: /arctic/admin/admin + - Auth'ing against db_name (which may be None if auth'ing against admin above) + + returns True if authentication succeeded. 
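+
+    Illustrative example (host alias and database name are assumptions):
+        >>> connection = pymongo.MongoClient('hostname')
+        >>> do_db_auth('hostname', connection, 'arctic_jblackburn')
+        True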
+ """ + admin_creds = get_auth(host, 'admin', 'admin') + user_creds = get_auth(host, 'arctic', db_name) + + # Attempt to authenticate the connection + # Try at 'admin level' first as this allows us to enableSharding, which we want + if admin_creds is None: + # Get ordinary credentials for authenticating against the DB + if user_creds is None: + logger.error("You need credentials for db '%s' on '%s', or admin credentials" % (db_name, host)) + return False + if not authenticate(connection[db_name], user_creds.user, user_creds.password): + logger.error("Failed to authenticate to db '%s' on '%s', using user credentials" % (db_name, host)) + return False + return True + elif not authenticate(connection.admin, admin_creds.user, admin_creds.password): + logger.error("Failed to authenticate to '%s' as Admin. Giving up." % (host)) + return False + # Ensure we attempt to auth against the user DB, for non-priviledged users to get access + authenticate(connection[db_name], user_creds.user, user_creds.password) + return True diff --git a/arctic/store/__init__.py b/arctic/store/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/arctic/store/_ndarray_store.py b/arctic/store/_ndarray_store.py new file mode 100644 index 000000000..8642ec9ad --- /dev/null +++ b/arctic/store/_ndarray_store.py @@ -0,0 +1,393 @@ +from bson.binary import Binary +import hashlib +import numpy as np +import pprint +from pymongo import ReadPreference +import pymongo +from pymongo.errors import OperationFailure, DuplicateKeyError + +from ..logging import logger +from ..decorators import mongo_retry, dump_bad_documents +from ..exceptions import UnhandledDtypeException +from ._version_store_utils import checksum + +from .._compression import compress_array, decompress +from ..exceptions import ConcurrentModificationException + +_CHUNK_SIZE = 2 * 1024 * 1024 - 2048 # ~2 MB (a bit less for usePowerOf2Sizes) +_APPEND_SIZE = 1 * 1024 * 1024 # 1MB +_APPEND_COUNT = 60 # 1 hour of 1 min data + + +def _promote_struct_dtypes(dtype1, dtype2): + if not set(dtype1.names).issuperset(set(dtype2.names)): + raise Exception("Removing columns from dtype not handled") + + def _promote(type1, type2): + if type2 is None: + return type1 + if type1.shape is not None: + if not type1.shape == type2.shape: + raise Exception("We do not handle changes to dtypes that have shape") + return np.promote_types(type1.base, type2.base), type1.shape + return np.promote_types(type1, type2) + return np.dtype([(n, _promote(dtype1.fields[n][0], dtype2.fields.get(n, (None,))[0])) for n in dtype1.names]) + + +class NdarrayStore(object): + """Chunked store for arbitrary ndarrays, supporting append.""" + TYPE = 'ndarray' + + @classmethod + def initialize_library(cls, *args, **kwargs): + pass + + @staticmethod + def _ensure_index(collection): + try: + collection.create_index([('symbol', pymongo.HASHED)], background=True) + collection.create_index([('symbol', pymongo.ASCENDING), + ('sha', pymongo.ASCENDING)], unique=True, background=True) + collection.create_index([('symbol', pymongo.ASCENDING), + ('parent', pymongo.ASCENDING), + ('segment', pymongo.ASCENDING)], unique=True, background=True) + except OperationFailure, e: + if "can't use unique indexes" in str(e): + return + raise + + + @mongo_retry + def can_delete(self, version, symbol): + return self.can_read(version, symbol) + + def can_read(self, version, symbol): + return version['type'] == self.TYPE + + def can_write(self, version, symbol, data): + return isinstance(data, np.ndarray) and not 
data.dtype.hasobject + + def _dtype(self, string, metadata=None): + if metadata is None: + metadata = {} + if string.startswith('['): + return np.dtype(eval(string), metadata=metadata) + return np.dtype(string, metadata=metadata) + + + def _index_range(self, version, symbol, from_version=None, **kwargs): + """ + Tuple describing range to read from the ndarray - closed:open + """ + from_index = None + if from_version: + if version['base_sha'] != from_version['base_sha']: + #give up - the data has been overwritten, so we can't tail this + raise ConcurrentModificationException("Concurrent modification - data has been overwritten") + from_index = from_version['up_to'] + return from_index, None + + def get_info(self, arctic_lib, version, symbol, **kwargs): + collection = arctic_lib.get_top_level_collection() + dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {})) + length = int(version['up_to']) + + spec = {'symbol': symbol, + 'parent': version.get('base_version_id', version['_id']), + 'segment': {'$lt': length}} + + n_segments = collection.find(spec).count() + + est_size = dtype.itemsize * length + return """Handler: %s + +dtype: %s + +%d rows in %d segments +Data size: %s bytes + +Version document: +%s""" % (self.__class__.__name__, dtype, length, n_segments, est_size, pprint.pformat(version)) + + + def read(self, arctic_lib, version, symbol, read_preference=None, **kwargs): + index_range = self._index_range(version, symbol, **kwargs) + collection = arctic_lib.get_top_level_collection() + if read_preference: + collection = collection.with_options(read_preference=read_preference) + return self._do_read(collection, version, symbol, index_range=index_range) + + def _do_read(self, collection, version, symbol, index_range=None): + from_index = index_range[0] if index_range else None + to_index = index_range[1] if index_range and index_range[1] is not None \ + and index_range[1] < version['up_to'] else version['up_to'] + segment_count = None + + spec = {'symbol': symbol, + 'parent': version.get('base_version_id', version['_id']), + 'segment': {'$lt': to_index}} + if from_index: + spec['segment'] = {'$lt': version['up_to'], '$gte': from_index} + else: + segment_count = version.get('segment_count', None) + + segments = [] + i = -1 + for i, x in enumerate(collection.find(spec, sort=[('segment', pymongo.ASCENDING)],)): + try: + segments.append(decompress(x['data']) if x['compressed'] else x['data']) + except Exception: + dump_bad_documents(x, collection.find_one({'_id': x['_id']}), + collection.find_one({'_id': x['_id']}), + collection.find_one({'_id': x['_id']})) + raise + data = ''.join(segments) + + # Check that the correct number of segments has been returned + if segment_count is not None and i + 1 != segment_count: + raise OperationFailure("Incorrect number of segments returned for {}:{}. Expected: {}, but got {}. {}".format( + symbol, version['version'], segment_count, i + 1, collection.database.name + '.' 
+ collection.name)) + + dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {})) + rtn = np.fromstring(data, dtype=dtype).reshape(version.get('shape', (-1))) + return rtn + + def _promote_types(self, item, dtype_str): + if dtype_str == str(item.dtype): + return item.dtype + prev_dtype = self._dtype(dtype_str) + if item.dtype.names is None: + rtn = np.promote_types(item.dtype, prev_dtype) + else: + rtn = _promote_struct_dtypes(item.dtype, prev_dtype) + rtn = np.dtype(rtn, metadata=dict(item.dtype.metadata or {})) + return rtn + + def append(self, arctic_lib, version, symbol, item, previous_version): + collection = arctic_lib.get_top_level_collection() + if previous_version.get('shape', [-1]) != [-1, ] + list(item.shape)[1:]: + raise UnhandledDtypeException() + + if previous_version['up_to'] == 0: + dtype = item.dtype + elif len(item) == 0: + dtype = self._dtype(previous_version['dtype']) + else: + dtype = self._promote_types(item, previous_version['dtype']) + item = item.astype(dtype) + if str(dtype) != previous_version['dtype']: + logger.debug('Converting %s from %s to %s' % (symbol, previous_version['dtype'], str(dtype))) + if item.dtype.hasobject: + raise UnhandledDtypeException() + version['dtype'] = str(item.dtype) + version['dtype_metadata'] = dict(item.dtype.metadata or {}) + version['type'] = self.TYPE + + old_arr = self._do_read(collection, previous_version, symbol).astype(dtype) + # missing float columns should default to nan rather than zero + old_dtype = self._dtype(previous_version['dtype']) + if dtype.names is not None and old_dtype.names is not None: + new_columns = set(dtype.names) - set(old_dtype.names) + _is_float_type = lambda _dtype: _dtype.type in (np.float32, np.float64) + _is_void_float_type = lambda _dtype: _dtype.type == np.void and _is_float_type(_dtype.subdtype[0]) + _is_float_or_void_float_type = lambda _dtype: _is_float_type(_dtype) or _is_void_float_type(_dtype) + _is_float = lambda column: _is_float_or_void_float_type(dtype.fields[column][0]) + for new_column in filter(_is_float, new_columns): + old_arr[new_column] = np.nan + + item = np.concatenate([old_arr, item]) + version['up_to'] = len(item) + version['sha'] = self.checksum(item) + version['base_sha'] = version['sha'] + self._do_write(collection, version, symbol, item, previous_version) + else: + version['dtype'] = previous_version['dtype'] + version['dtype_metadata'] = previous_version['dtype_metadata'] + version['type'] = self.TYPE + self._do_append(collection, version, symbol, item, previous_version) + + def _do_append(self, collection, version, symbol, item, previous_version): + + data = item.tostring() + version['base_sha'] = previous_version['base_sha'] + version['up_to'] = previous_version['up_to'] + len(item) + if len(item) > 0: + version['segment_count'] = previous_version['segment_count'] + 1 + version['append_count'] = previous_version['append_count'] + 1 + version['append_size'] = previous_version['append_size'] + len(data) + else: + version['segment_count'] = previous_version['segment_count'] + version['append_count'] = previous_version['append_count'] + version['append_size'] = previous_version['append_size'] + + #_CHUNK_SIZE is probably too big if we're only appending single rows of data - perhaps something smaller, + #or also look at number of appended segments? 
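+        # (Illustrative summary: with the module defaults, up to _APPEND_COUNT (60) appends
+        #  totalling under _APPEND_SIZE (1MB) are written as small uncompressed segments;
+        #  beyond that the version is concatenated and rewritten into compressed chunks.)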
+ if version['append_count'] < _APPEND_COUNT and version['append_size'] < _APPEND_SIZE: + version['base_version_id'] = previous_version.get('base_version_id', previous_version['_id']) + + if len(item) > 0: + + segment = {'data': Binary(data), 'compressed': False} + segment['segment'] = version['up_to'] - 1 + try: + collection.update_one({'symbol': symbol, + 'sha': checksum(symbol, segment)}, + {'$set': segment, + '$addToSet': {'parent': version['base_version_id']}}, + upsert=True) + except DuplicateKeyError: + '''If we get a duplicate key error here, this segment has the same symbol/parent/segment + as another chunk, but a different sha. This means that we have 'forked' history. + If we concat_and_rewrite here, new chunks will have a different parent id (the _id of this version doc) + ...so we can safely write them. + ''' + self._concat_and_rewrite(collection, version, symbol, item, previous_version) + return + + if 'segment_index' in previous_version: + segment_index = self._segment_index(item, + existing_index=previous_version.get('segment_index'), + start=previous_version['up_to'], + new_segments=[segment['segment'], ]) + if segment_index: + version['segment_index'] = segment_index + logger.debug("Appended segment %d for parent %s" % (segment['segment'], version['_id'])) + else: + if 'segment_index' in previous_version: + version['segment_index'] = previous_version['segment_index'] + + else: # Too much data has been appended now, so rewrite (and compress/chunk). + self._concat_and_rewrite(collection, version, symbol, item, previous_version) + + def _concat_and_rewrite(self, collection, version, symbol, item, previous_version): + + version.pop('base_version_id', None) + + # Figure out which is the last 'full' chunk + spec = {'symbol': symbol, + 'parent': previous_version.get('base_version_id', previous_version['_id']), + 'segment': {'$lt': version['up_to']}} + + read_index_range = [0, None] + unchanged_segment_ids = list(collection.find(spec, projection={'_id':1, 'segment':1}, + sort=[('segment', pymongo.ASCENDING)],))\ + [:-1 * (previous_version['append_count'] + 1)] + if unchanged_segment_ids: + read_index_range[0] = unchanged_segment_ids[-1]['segment'] + 1 + + old_arr = self._do_read(collection, previous_version, symbol, index_range=read_index_range) + if len(item) == 0: + logger.debug('Rewrite and compress/chunk item %s, rewrote old_arr' % symbol) + self._do_write(collection, version, symbol, old_arr, previous_version, segment_offset=read_index_range[0]) + elif len(old_arr) == 0: + logger.debug('Rewrite and compress/chunk item %s, wrote item' % symbol) + self._do_write(collection, version, symbol, item, previous_version, segment_offset=read_index_range[0]) + else: + logger.debug("Rewrite and compress/chunk %s, np.concatenate %s to %s" % (symbol, + item.dtype, old_arr.dtype)) + self._do_write(collection, version, symbol, np.concatenate([old_arr, item]), previous_version, segment_offset=read_index_range[0]) + if unchanged_segment_ids: + collection.update_many({'symbol': symbol, '_id': {'$in': [x['_id'] for x in unchanged_segment_ids]}}, + {'$addToSet': {'parent': version['_id']}}) + version['segment_count'] = version['segment_count'] + len(unchanged_segment_ids) + + def check_written(self, collection, symbol, version): + # Check all the chunks are in place + seen_chunks = collection.find({'symbol': symbol, 'parent': version['_id']}, + ).count() + + if seen_chunks != version['segment_count']: + segments = [x['segment'] for x in collection.find({'symbol': symbol, 'parent': 
version['_id']}, + projection={'segment': 1}, + )] + raise pymongo.errors.OperationFailure("Failed to write all the Chunks. Saw %s expecting %s" + "Parent: %s \n segments: %s" % + (seen_chunks, version['segment_count'], version['_id'], segments)) + + def checksum(self, item): + sha = hashlib.sha1() + sha.update(item.tostring()) + return Binary(sha.digest()) + + def write(self, arctic_lib, version, symbol, item, previous_version): + collection = arctic_lib.get_top_level_collection() + if item.dtype.hasobject: + raise UnhandledDtypeException() + + version['dtype'] = str(item.dtype) + version['shape'] = (-1,) + item.shape[1:] + version['dtype_metadata'] = dict(item.dtype.metadata or {}) + version['type'] = self.TYPE + version['up_to'] = len(item) + version['sha'] = self.checksum(item) + + if previous_version: + if version['dtype'] == str(item.dtype) \ + and 'sha' in previous_version \ + and self.checksum(item[:previous_version['up_to']]) == previous_version['sha']: + #The first n rows are identical to the previous version, so just append. + self._do_append(collection, version, symbol, item[previous_version['up_to']:], previous_version) + return + + version['base_sha'] = version['sha'] + self._do_write(collection, version, symbol, item, previous_version) + + def _do_write(self, collection, version, symbol, item, previous_version, segment_offset=0): + + sze = int(item.dtype.itemsize * np.prod(item.shape[1:])) + + # chunk and store the data by (uncompressed) size + chunk_size = _CHUNK_SIZE / sze + + previous_shas = [] + if previous_version: + previous_shas = set([x['sha'] for x in + collection.find({'symbol': symbol}, + projection={'sha': 1, '_id': 0}, + ) + ]) + + length = len(item) + + if segment_offset > 0 and 'segment_index' in previous_version: + existing_index = previous_version['segment_index'] + else: + existing_index = None + + segment_index = [] + i = -1 + + # Compress + idxs = xrange(int(np.ceil(float(length) / chunk_size))) + chunks = [(item[i * chunk_size: (i + 1) * chunk_size]).tostring() for i in idxs] + compressed_chunks = compress_array(chunks) + + # Write + bulk = collection.initialize_unordered_bulk_op() + for i, chunk in zip(idxs, compressed_chunks): + segment = {'data': Binary(chunk), 'compressed':True} + segment['segment'] = min((i + 1) * chunk_size - 1, length - 1) + segment_offset + segment_index.append(segment['segment']) + sha = checksum(symbol, segment) + if sha not in previous_shas: + segment['sha'] = sha + bulk.find({'symbol': symbol, 'sha': sha, 'segment': segment['segment']} + ).upsert().update_one({'$set': segment, '$addToSet': {'parent': version['_id']}}) + else: + bulk.find({'symbol': symbol, 'sha': sha, 'segment': segment['segment']} + ).update_one({'$addToSet': {'parent': version['_id']}}) + if i != -1: + bulk.execute() + + segment_index = self._segment_index(item, existing_index=existing_index, start=segment_offset, new_segments=segment_index) + if segment_index: + version['segment_index'] = segment_index + version['segment_count'] = i + 1 + version['append_size'] = 0 + version['append_count'] = 0 + + self.check_written(collection, symbol, version) + + def _segment_index(self, item, existing_index, start, new_segments): + pass + diff --git a/arctic/store/_pandas_ndarray_store.py b/arctic/store/_pandas_ndarray_store.py new file mode 100644 index 000000000..50296ec17 --- /dev/null +++ b/arctic/store/_pandas_ndarray_store.py @@ -0,0 +1,209 @@ +from _ndarray_store import NdarrayStore +from pandas import DataFrame, MultiIndex, Series, DatetimeIndex, Panel +from 
pandas.tslib import Timestamp, get_timezone +import numpy as np + +from ..logging import logger as log + + +def _to_primitive(arr): + if arr.dtype.hasobject: + if len(arr) > 0: + if isinstance(arr[0], Timestamp): + return arr.astype('datetime64[ns]') + return np.array(list(arr)) + return arr + + +class PandasStore(NdarrayStore): + + def _index_to_records(self, df): + metadata = {} + index = df.index + + if isinstance(index, MultiIndex): + # array of tuples to numpy cols. copy copy copy + if len(df) > 0: + ix_vals = map(np.array, zip(*index.values)) + else: + # empty multi index has no size, create empty arrays for recarry.. + ix_vals = [np.array([]) for n in index.names] + else: + ix_vals = [index.values] + + count = 0 + index_names = list(index.names) + if isinstance(index, MultiIndex): + for i, n in enumerate(index_names): + if n is None: + index_names[i] = 'level_%d' % count + count += 1 + elif index_names[0] is None: + index_names = ['index'] + + metadata['index'] = index_names + + if isinstance(index, DatetimeIndex) and index.tz is not None: + metadata['index_tz'] = get_timezone(index.tz) + + return index_names, ix_vals, metadata + + def _index_from_records(self, recarr): + index = recarr.dtype.metadata['index'] + rtn = MultiIndex.from_arrays([recarr[str(i)] for i in index], names=index) + + if isinstance(rtn, DatetimeIndex) and 'index_tz' in recarr.dtype.metadata: + rtn = rtn.tz_localize('UTC').tz_convert(recarr.dtype.metadata['index_tz']) + + return rtn + + def to_records(self, df): + """ + Similar to DataFrame.to_records() + Differences: + Attempt type conversion for pandas columns stored as objects (e.g. strings), + as we can only store primitives in the ndarray. + Use dtype metadata to store column and index names. + """ + + index_names, ix_vals, metadata = self._index_to_records(df) + columns, column_vals = self._column_data(df) + + metadata['columns'] = columns + names = index_names + columns + arrays = ix_vals + column_vals + arrays = map(_to_primitive, arrays) + dtype = np.dtype([(str(x), v.dtype) if len(v.shape) == 1 else (str(x), v.dtype, v.shape[1]) for x, v in zip(names, arrays)], + metadata=metadata) + rtn = np.rec.fromarrays(arrays, dtype=dtype, names=names) + #For some reason the dtype metadata is lost in the line above. + rtn.dtype = dtype + return rtn + + def can_convert_to_records_without_objects(self, df, symbol): + # We can't easily distinguish string columns from objects + try: + arr = self.to_records(df) + except Exception as e: + # This exception will also occur when we try to write the object so we fall-back to saving using Pickle + log.info('Pandas dataframe %s caused exception "%s" when attempting to convert to records. Saving as Blob.' 
+ % (symbol, repr(e))) + return False + else: + if arr.dtype.hasobject: + log.info('Pandas dataframe %s contains Objects, saving as Blob' % symbol) + # Will fall-back to saving using Pickle + return False + elif any([len(x[0].shape) for x in arr.dtype.fields.values()]): + log.info('Pandas dataframe %s contains >1 dimensional arrays, saving as Blob' % symbol) + return False + else: + return True + + +class PandasSeriesStore(PandasStore): + TYPE = 'pandasseries' + + def _column_data(self, s): + columns = [s.name if s.name else 'values'] + column_vals = [s.values] + return columns, column_vals + + def from_records(self, recarr): + index = self._index_from_records(recarr) + name = recarr.dtype.names[-1] + return Series.from_array(recarr[name], index=index, name=name) + + def can_write(self, version, symbol, data): + if isinstance(data, Series): + if data.dtype == np.object_ or data.index.dtype == np.object_: + return self.can_convert_to_records_without_objects(data, symbol) + return True + return False + + def write(self, arctic_lib, version, symbol, item, previous_version): + item = self.to_records(item) + super(PandasSeriesStore, self).write(arctic_lib, version, symbol, item, previous_version) + + def append(self, arctic_lib, version, symbol, item, previous_version): + item = self.to_records(item) + super(PandasSeriesStore, self).append(arctic_lib, version, symbol, item, previous_version) + + def read(self, arctic_lib, version, symbol, **kwargs): + item = super(PandasSeriesStore, self).read(arctic_lib, version, symbol, **kwargs) + return self.from_records(item) + + +class PandasDataFrameStore(PandasStore): + TYPE = 'pandasdf' + + def _column_data(self, df): + columns = list(map(str, df.columns)) + column_vals = [df[c].values for c in df.columns] + return columns, column_vals + + + def from_records(self, recarr): + index = self._index_from_records(recarr) + column_fields = [x for x in recarr.dtype.names if x not in recarr.dtype.metadata['index']] + if len(recarr) == 0: + rdata = recarr[column_fields] if len(column_fields) > 0 else None + return DataFrame(rdata, index=index) + + columns = recarr.dtype.metadata['columns'] + return DataFrame(data=recarr[column_fields], index=index, columns=columns) + + def can_write(self, version, symbol, data): + if isinstance(data, DataFrame): + if np.any(data.dtypes.values == 'object'): + return self.can_convert_to_records_without_objects(data, symbol) + return True + return False + + def write(self, arctic_lib, version, symbol, item, previous_version): + item = self.to_records(item) + super(PandasDataFrameStore, self).write(arctic_lib, version, symbol, item, previous_version) + + def append(self, arctic_lib, version, symbol, item, previous_version): + item = self.to_records(item) + super(PandasDataFrameStore, self).append(arctic_lib, version, symbol, item, previous_version) + + def read(self, arctic_lib, version, symbol, **kwargs): + item = super(PandasDataFrameStore, self).read(arctic_lib, version, symbol, **kwargs) + return self.from_records(item) + +class PandasPanelStore(PandasDataFrameStore): + TYPE = 'pandaspan' + + def can_write(self, version, symbol, data): + if isinstance(data, Panel): + frame = data.to_frame() + if np.any(frame.dtypes.values == 'object'): + return self.can_convert_to_records_without_objects(frame, symbol) + return True + return False + + def write(self, arctic_lib, version, symbol, item, previous_version): + if np.product(item.shape) == 0: + # Currently not supporting zero size panels as they drop indices when converting to 
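As a quick aside on the record-array round trip implemented by to_records/from_records above: the core idea can be pictured without Mongo or the dtype-metadata bookkeeping. frame_to_records/records_to_frame below are illustrative helper names, not arctic API, and this is only a minimal sketch:

    import numpy as np
    import pandas as pd

    def frame_to_records(df):
        # store the index as just another primitive field of the record array
        names = ['index'] + [str(c) for c in df.columns]
        arrays = [df.index.values] + [df[c].values for c in df.columns]
        dtype = np.dtype([(n, a.dtype) for n, a in zip(names, arrays)])
        return np.rec.fromarrays(arrays, dtype=dtype)

    def records_to_frame(recarr):
        return pd.DataFrame.from_records(recarr, index='index')

    df = pd.DataFrame({'price': [1.0, 2.0], 'size': [10, 20]},
                      index=pd.date_range('2015-01-01', periods=2))
    rt = records_to_frame(frame_to_records(df))
    assert (rt.values == df.values).all() and (rt.index == df.index).all()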
dataframes + # Plan is to find a better solution in due course. + raise ValueError('Cannot insert a zero size panel into mongo.') + if not np.all(len(i.names) == 1 for i in item.axes): + raise ValueError('Cannot insert panels with multiindexes') + item = item.to_frame() + if len(set(item.dtypes)) == 1: + # If all columns have the same dtype, we support non-string column names. + # We know from above check that columns is not a multiindex. + item = DataFrame(item.stack()) + elif item.columns.dtype != np.dtype('object'): + raise ValueError('Cannot support non-object dtypes for columns') + super(PandasPanelStore, self).write(arctic_lib, version, symbol, item, previous_version) + + def read(self, arctic_lib, version, symbol, **kwargs): + item = super(PandasPanelStore, self).read(arctic_lib, version, symbol, **kwargs) + if len(item.index.names) == 3: + return item.iloc[:, 0].unstack().to_panel() + return item.to_panel() + + def append(self, arctic_lib, version, symbol, item, previous_version): + raise ValueError('Appending not supported for pandas.Panel') diff --git a/arctic/store/_pickle_store.py b/arctic/store/_pickle_store.py new file mode 100644 index 000000000..80e0717b5 --- /dev/null +++ b/arctic/store/_pickle_store.py @@ -0,0 +1,62 @@ +import bson +from bson.binary import Binary +from bson.errors import InvalidDocument +import cPickle +import lz4 +import pymongo +import pprint + +from ._version_store_utils import checksum + +_MAGIC_CHUNKED = '__chunked__' +_CHUNK_SIZE = 15 * 1024 * 1024 # 15MB + + +class PickleStore(object): + + @classmethod + def initialize_library(cls, *args, **kwargs): + pass + + def get_info(self, arctic_lib, version, symbol, **kwargs): + if 'blob' in version: + if version['blob'] != _MAGIC_CHUNKED: + version['blob'] = "" + + return """Handler: %s\n\nVersion document:\n%s""" % (self.__class__.__name__, pprint.pformat(version)) + + def read(self, arctic_lib, version, symbol, **kwargs): + if 'blob' in version: + if version['blob'] == _MAGIC_CHUNKED: + collection = arctic_lib.get_top_level_collection() + data = ''.join([x['data'] for x in collection.find({'symbol': symbol, + 'parent': version['_id']}, + sort=[('segment', pymongo.ASCENDING)])]) + else: + data = version['blob'] + # Backwards compatibility + return cPickle.loads(lz4.decompress(data)) + return version['data'] + + def write(self, arctic_lib, version, symbol, item, previous_version): + try: + # If it's encodeable, then ship it + bson.BSON.encode({'data': item}) + version['data'] = item + return + except InvalidDocument: + pass + + # Pickle, chunk and store the data + collection = arctic_lib.get_top_level_collection() + # Try to pickle it. 
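The chunked-blob layout that read() above reassembles is easy to picture in isolation. A minimal sketch, using the stdlib pickle and plain dicts in place of the lz4 compression and Mongo documents used by the store itself (to_segments/from_segments are illustrative names):

    import pickle

    _CHUNK = 15 * 1024 * 1024   # same 15MB budget as _CHUNK_SIZE above

    def to_segments(item):
        blob = pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)
        # one document per slice; 'segment' records the reassembly order
        return [{'segment': i, 'data': blob[i * _CHUNK:(i + 1) * _CHUNK]}
                for i in range(len(blob) // _CHUNK + 1)]

    def from_segments(segments):
        ordered = sorted(segments, key=lambda s: s['segment'])
        return pickle.loads(b''.join(s['data'] for s in ordered))

    item = {'a': list(range(1000))}
    assert from_segments(to_segments(item)) == item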
This is best effort + version['blob'] = _MAGIC_CHUNKED + pickled = lz4.compressHC(cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL)) + + for i in xrange(len(pickled) / _CHUNK_SIZE + 1): + segment = {'data': Binary(pickled[i * _CHUNK_SIZE : (i + 1) * _CHUNK_SIZE])} + sha = checksum(symbol, segment) + segment['segment'] = i + collection.update_one({'symbol': symbol, 'sha': sha}, {'$set': segment, + '$addToSet': {'parent': version['_id']}}, + upsert=True) diff --git a/arctic/store/_version_store_utils.py b/arctic/store/_version_store_utils.py new file mode 100644 index 000000000..5ea572958 --- /dev/null +++ b/arctic/store/_version_store_utils.py @@ -0,0 +1,55 @@ +from bson import Binary +import hashlib +import numpy as np + +def _split_arrs(array_2d, slices): + """ + Equivalent to numpy.split(array_2d, slices), + but avoids fancy indexing + """ + if len(array_2d) == 0: + return np.empty(0, dtype=np.object) + + rtn = np.empty(len(slices) + 1, dtype=np.object) + start = 0 + for i, s in enumerate(slices): + rtn[i] = array_2d[start:s] + start = s + rtn[-1] = array_2d[start:] + return rtn + + +def checksum(symbol, doc): + """ + Checksum the passed in dictionary + """ + sha = hashlib.sha1() + sha.update(symbol) + for k in sorted(doc.iterkeys(), reverse=True): + sha.update(str(doc[k])) + return Binary(sha.digest()) + + +def cleanup(arctic_lib, symbol, version_ids): + """ + Helper method for cleaning up chunks from a version store + """ + collection = arctic_lib.get_top_level_collection() + + # Remove any chunks which contain just the parents, at the outset + # We do this here, because $pullALL will make an empty array: [] + # and the index which contains the parents field will fail the unique constraint. + for v in version_ids: + # Remove all documents which only contain the parent + collection.delete_many({'symbol': symbol, + 'parent': {'$all': [v], + '$size': 1}, + }) + # Pull the parent from the parents field + collection.update_many({'symbol': symbol, + 'parent': v}, + {'$pull': {'parent': v}}) + + # Now remove all chunks which aren't parented - this is unlikely, as they will + # have been removed by the above + collection.delete_one({'symbol': symbol, 'parent': {'$size': 0}}) diff --git a/arctic/store/audit.py b/arctic/store/audit.py new file mode 100644 index 000000000..e233a5a4a --- /dev/null +++ b/arctic/store/audit.py @@ -0,0 +1,137 @@ +""" +Handle audited data changes. +""" +from functools import partial + +from pymongo.errors import OperationFailure + +from .._util import are_equals +from ..decorators import _get_host +from ..exceptions import NoDataFoundException, ConcurrentModificationException +from ..logging import logger +from .versioned_item import VersionedItem, ChangedItem + + +class DataChange(object): + """ + Object representing incoming data change + """ + def __init__(self, date_range, new_data): + self.date_range = date_range + self.new_data = new_data + + +class ArcticTransaction(object): + '''Use this context manager if you want to modify data in a version store while ensuring that no other writes + interfere with your own. + + To use, base your modifications on the `base_ts` context manager field and put your newly created timeseries and + call the `write` method of the context manager to output changes. The changes will only be written when the block + exits. + + NB changes are audited. 
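Usage sketch for the checksum helper defined above, assuming the patch is importable as arctic.store._version_store_utils: the (symbol, segment-contents) pair deterministically keys a chunk, which is what lets the ndarray writer skip re-uploading unchanged segments. The 'EURUSD' symbol and payload bytes are made up:

    from bson import Binary
    from arctic.store._version_store_utils import checksum

    seg = {'segment': 99, 'compressed': True, 'data': Binary(b'compressed-bytes')}
    assert checksum('EURUSD', seg) == checksum('EURUSD', dict(seg))
    assert checksum('EURUSD', seg) != checksum('EURUSD',
                                               dict(seg, data=Binary(b'other-bytes')))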
+ + Example: + ------- + with ArcticTransaction(Arctic('hostname')['some_library'], 'symbol') as mt: + ts_version_info = mt.base_ts + # do some processing, come up with a new ts for 'symbol' called new_symbol_ts, presumably based on ts_version_info.data + mt.write('symbol', new_symbol_ts, metadata=new_symbol_metadata) + + The block will raise a ConcurrentModificationException if an inconsistency has been detected. You will have to + retry the whole block should that happens, as the assumption is that you need to base your changes on a different + starting timeseries. + ''' + def __init__(self, version_store, symbol, user, log, modify_timeseries=None, *args, **kwargs): + ''' + Parameters + ---------- + version_store: `VersionStore` Arctic Library + Needs to support write, read, list_versions, _delete_version this is the underlying store that we'll + be securing for write + + symbol: `str` + symbol name for the item that's being modified + + user: `str` + user making the change + + log: `str` + Log message for the change + + modify_timeseries: + if given, it will check the assumption that this is the latest data available for symbol in version_store + Should not this be the case, a ConcurrentModificationException will be raised. Use this if you're + interacting with code that read in the data already and for some reason you cannot refactor the read-write + operation to be contained within this context manager + + all other args: + Will be passed into the initial read + ''' + self._version_store = version_store + self._symbol = symbol + self._user = user + self._log = log + logger.info("MT: {}@{}: [{}] {}: {}".format(_get_host(version_store).get('l'), + _get_host(version_store).get('mhost'), + user, log, symbol) + ) + try: + self.base_ts = self._version_store.read(self._symbol, *args, **kwargs) + except NoDataFoundException: + versions = [x['version'] for x in self._version_store.list_versions(self._symbol, latest_only=True)] + versions.append(0) + self.base_ts = VersionedItem(symbol=self._symbol, library=None, + version=versions[0], metadata=None, data=None) + except OperationFailure: + #TODO: Current errors in mongo "Incorrect Number of Segments Returned" + # This workaround should be removed once underlying problem is resolved. + self.base_ts = self._version_store.read_metadata(symbol=self._symbol) + + if modify_timeseries is not None and not are_equals(modify_timeseries, self.base_ts.data): + raise ConcurrentModificationException() + self._do_write = False + + def change(self, symbol, data_changes, **kwargs): + """ + Change, and audit 'data' under the specified 'symbol' name to this library. + + Parameters + ---------- + symbol: `str` + symbol name for the item + + data_changes: `list DataChange` + list of DataChange objects + """ + pass + + def write(self, symbol, data, prune_previous_version=True, metadata=None, **kwargs): + '''Records a write request to be actioned on context exit. Takes exactly the same parameters as the regular + library write call. 
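A common calling pattern around this context manager is a small retry loop, since a ConcurrentModificationException means the base version moved underneath you. In the sketch below, library is assumed to be an existing VersionStore and compute_new_ts is the caller's own rebasing function; both names are illustrative:

    from arctic.exceptions import ConcurrentModificationException

    for attempt in range(3):
        try:
            with ArcticTransaction(library, 'symbol', 'user', 'rebase fix') as mt:
                new_ts = compute_new_ts(mt.base_ts.data)   # rebase on the latest data
                mt.write('symbol', new_ts, metadata={'attempt': attempt})
            break
        except ConcurrentModificationException:
            continue   # someone else wrote a new version; re-read and retry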
+ ''' + if data is not None: + # We only write data if existing data is None or the Timeseries data has changed or metadata has changed + if self.base_ts.data is None or not are_equals(data, self.base_ts.data) or metadata != self.base_ts.metadata: + self._do_write = True + self._write = partial(self._version_store.write, symbol, data, prune_previous_version=prune_previous_version, + metadata=metadata, **kwargs) + + def __enter__(self): + return self + + def __exit__(self, *args, **kwargs): + if self._do_write: + written_ver = self._write() + versions = [x['version'] for x in self._version_store.list_versions(self._symbol)] + versions.append(0) + versions.reverse() + base_offset = versions.index(self.base_ts.version) + new_offset = versions.index(written_ver.version) + if len(versions[base_offset: new_offset + 1]) != 2: + self._version_store._delete_version(self._symbol, written_ver.version) + raise ConcurrentModificationException("Inconsistent Versions: {}: {}->{}".format( + self._symbol, self.base_ts.version, written_ver.version)) + + changed = ChangedItem(self._symbol, self.base_ts, written_ver, None) + self._version_store._write_audit(self._user, self._log, changed) diff --git a/arctic/store/version_store.py b/arctic/store/version_store.py new file mode 100644 index 000000000..76a41bf92 --- /dev/null +++ b/arctic/store/version_store.py @@ -0,0 +1,867 @@ +from datetime import datetime as dt, timedelta +import pprint + +import bson +from pymongo import ReadPreference +import pymongo +from pymongo.errors import OperationFailure, AutoReconnect + +from .._util import indent, enable_powerof2sizes, \ + enable_sharding +from ..date import mktz, datetime_to_ms, ms_to_datetime +from ..decorators import mongo_retry +from ..exceptions import NoDataFoundException, DuplicateSnapshotException, \ + OptimisticLockException, ArcticException +from ..hooks import log_exception +from ..logging import logger +from ._pickle_store import PickleStore +from ._version_store_utils import cleanup +from .versioned_item import VersionedItem + + +VERSION_STORE_TYPE = 'VersionStore' +_TYPE_HANDLERS = [] + + +def register_versioned_storage(storageClass): + existing_instances = [i for i, v in enumerate(_TYPE_HANDLERS) if str(v.__class__) == str(storageClass)] + if existing_instances: + for i in existing_instances: + _TYPE_HANDLERS[i] = storageClass() + else: + _TYPE_HANDLERS.append(storageClass()) + return storageClass + + + +class VersionStore(object): + + _bson_handler = PickleStore() + + @classmethod + def initialize_library(cls, arctic_lib, hashed=False, **kwargs): + c = arctic_lib.get_top_level_collection() + + if '%s.changes' % c.name not in mongo_retry(c.database.collection_names)(): + # 32MB buffer for change notifications + mongo_retry(c.database.create_collection)('%s.changes' % c.name, capped=True, size=32 * 1024 * 1024) + + for th in _TYPE_HANDLERS: + th.initialize_library(arctic_lib, **kwargs) + VersionStore._bson_handler.initialize_library(arctic_lib, **kwargs) + VersionStore(arctic_lib)._ensure_index() + + logger.info("Trying to enable usePowerOf2Sizes...") + try: + enable_powerof2sizes(arctic_lib.arctic, arctic_lib.get_name()) + except OperationFailure, e: + logger.error("Library created, but couldn't enable usePowerOf2Sizes: %s" % str(e)) + + logger.info("Trying to enable sharding...") + try: + enable_sharding(arctic_lib.arctic, arctic_lib.get_name(), hashed=hashed) + except OperationFailure, e: + logger.warn("Library created, but couldn't enable sharding: %s. 
This is OK if you're not 'admin'" % str(e)) + + @mongo_retry + def _ensure_index(self): + collection = self._collection + collection.snapshots.create_index([('name', pymongo.ASCENDING)], unique=True, + background=True) + collection.versions.create_index([('symbol', pymongo.ASCENDING), ('_id', pymongo.DESCENDING)], + background=True) + collection.versions.create_index([('symbol', pymongo.ASCENDING), ('version', pymongo.DESCENDING)], unique=True, + background=True) + collection.version_nums.create_index('symbol', unique=True, background=True) + for th in _TYPE_HANDLERS: + th._ensure_index(collection) + + @mongo_retry + def __init__(self, arctic_lib): + self._arctic_lib = arctic_lib + + # Do we allow reading from secondaries + self._allow_secondary = self._arctic_lib.arctic._allow_secondary + + # The default collections + self._collection = arctic_lib.get_top_level_collection() + self._audit = self._collection.audit + self._snapshots = self._collection.snapshots + self._versions = self._collection.versions + self._version_nums = self._collection.version_nums + self._publish_changes = '%s.changes' % self._collection.name in self._collection.database.collection_names() + if self._publish_changes: + self._changes = self._collection.changes + + def __getstate__(self): + return {'arctic_lib': self._arctic_lib} + + def __setstate__(self, state): + return VersionStore.__init__(self, state['arctic_lib']) + + def __str__(self): + return """<%s at %s> +%s""" % (self.__class__.__name__, hex(id(self)), indent(str(self._arctic_lib), 4)) + + def __repr__(self): + return str(self) + + @mongo_retry + def list_symbols(self, all_symbols=False, snapshot=None, regex=None, **kwargs): + """ + Return the symbols in this library. + + Parameters + ---------- + all_symbols : `bool` + If True returns all symbols under all snapshots, even if the symbol has been deleted + in the current version (i.e. it exists under a snapshot... Default: False + snapshot : `str` + Return the symbols available under the snapshot. + regex : `str` + filter symbols by the passed in regular expression + kwargs : + kwarg keys are used as fields to query for symbols with metadata matching + the kwargs query + + Returns + ------- + String list of symbols in the library + """ + query = {} + if regex is not None: + query ['symbol'] = {'$regex' : regex} + if kwargs: + for k, v in kwargs.iteritems(): + query['metadata.' + k] = v + if snapshot is not None: + try: + query['parent'] = self._snapshots.find_one({'name': snapshot})['_id'] + except TypeError: + raise NoDataFoundException('No snapshot %s in library %s' % (snapshot, self._arctic_lib.get_name())) + elif all_symbols: + return self._versions.find(query).distinct('symbol') + + # Return just the symbols which aren't deleted in the 'trunk' of this library + pipeline = [] + if query: + # Match based on user criteria first + pipeline.append({'$match': query}) + pipeline.extend([ + # Id is by insert time which matches version order + {'$sort': {'_id':-1}}, + # Group by 'symbol' + {'$group': {'_id': '$symbol', + 'deleted': {'$first': '$metadata.deleted'}, + }, + }, + # Don't include symbols which are part of some snapshot, but really deleted... 
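The aggregation pipeline being built here reads, in plain terms, as "keep a symbol only if its latest version is not flagged as deleted". A pure-python rendering of the same logic over made-up version documents:

    versions = [
        {'symbol': 'GBPUSD', 'version': 2, 'metadata': {'deleted': True}},
        {'symbol': 'GBPUSD', 'version': 1, 'metadata': None},
        {'symbol': 'EURUSD', 'version': 5, 'metadata': None},
    ]
    latest = {}
    for v in sorted(versions, key=lambda v: v['version'], reverse=True):
        latest.setdefault(v['symbol'], v)                  # $sort + $group/$first
    live = sorted(s for s, v in latest.items()
                  if not (v.get('metadata') or {}).get('deleted'))   # $match
    assert live == ['EURUSD']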
+ {'$match': {'deleted': {'$ne': True}}}, + {'$project': {'_id': 0, + 'symbol': '$_id', + } + }]) + + results = self._versions.aggregate(pipeline) + return sorted([x['symbol'] for x in results]) + + @mongo_retry + def has_symbol(self, symbol, as_of=None): + """ + Return True if the 'symbol' exists in this library AND the symbol + isn't deleted in the specified as_of. + + It's possible for a deleted symbol to exist in older snapshots. + + Parameters + ---------- + symbol : `str` + symbol name for the item + """ + try: + self._read_metadata(symbol, as_of=as_of) + return True + except NoDataFoundException: + return False + + def read_audit_log(self, symbol): + """ + Return the audit log associated with a given symbol + + Parameters + ---------- + symbol : `str` + symbol name for the item + """ + query = {'symbol': symbol} + return list(self._audit.find(query, sort=[('_id', -1)], + projection={'_id': False})) + + def list_versions(self, symbol=None, snapshot=None, latest_only=False): + """ + Return a list of versions filtered by the passed in parameters. + + Parameters + ---------- + symbol : `str` + Symbol to return versions for. If None returns versions across all + symbols in the library. + snapshot : `str` + Return the versions contained in the named snapshot + latest_only : `bool` + Only include the latest version for a specific symbol + + Returns + ------- + List of dictionaries describing the discovered versions in the library + """ + if symbol is None: + symbols = self.list_symbols() + else: + symbols = [symbol] + + query = {} + + if snapshot is not None: + try: + query['parent'] = self._snapshots.find_one({'name': snapshot})['_id'] + except TypeError: + raise NoDataFoundException('No snapshot %s in library %s' % (snapshot, self._arctic_lib.get_name())) + + versions = [] + for symbol in symbols: + query['symbol'] = symbol + seen_symbols = set() + for version in self._versions.find(query, projection=['symbol', 'version', 'parent'], sort=[('version', -1)]): + if latest_only and version['symbol'] in seen_symbols: + continue + seen_symbols.add(version['symbol']) + versions.append({'symbol': version['symbol'], 'version': version['version'], + # We return naive datetimes in London Time. + 'date': ms_to_datetime(datetime_to_ms(version['_id'].generation_time)), + 'snapshots': self._find_snapshots(version.get('parent', []))}) + return versions + + def _find_snapshots(self, parent_ids): + snapshots = [] + for p in parent_ids: + snap = self._snapshots.find_one({'_id': p}) + if snap: + snapshots.append(snap['name']) + else: + snapshots.append(str(p)) + return snapshots + + def _read_handler(self, version, symbol): + handler = None + for h in _TYPE_HANDLERS: + if h.can_read(version, symbol): + handler = h + break + if handler is None: + handler = self._bson_handler + return handler + + def _write_handler(self, version, symbol, data, **kwargs): + handler = None + for h in _TYPE_HANDLERS: + if h.can_write(version, symbol, data, **kwargs): + handler = h + break + if handler is None: + version['type'] = 'default' + handler = self._bson_handler + return handler + + def read(self, symbol, as_of=None, from_version=None, **kwargs): + """ + Read data for the named symbol. Returns a VersionedItem object with + a data and metdata element (as passed into write). + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. 
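The _read_handler/_write_handler dispatch above follows a first-match-wins convention with the pickle store as the catch-all. A self-contained sketch of that convention; the class and variable names below are illustrative, not arctic's:

    class NumberStore(object):
        def can_write(self, version, symbol, data):
            return isinstance(data, (int, float))

    class FallbackStore(object):
        def can_write(self, version, symbol, data):
            return True

    handlers = [NumberStore()]

    def pick_handler(version, symbol, data):
        for h in handlers:                 # first registered handler wins
            if h.can_write(version, symbol, data):
                return h
        return FallbackStore()             # the catch-all, like PickleStore

    assert isinstance(pick_handler({}, 'sym', 42), NumberStore)
    assert isinstance(pick_handler({}, 'sym', object()), FallbackStore)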
+ `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + + Returns + ------- + VersionedItem namedtuple which contains a .data and .metadata element + """ + try: + _version = self._read_metadata(symbol, as_of=as_of) + read_preference = ReadPreference.NEAREST if self._allow_secondary else None + return self._do_read(symbol, _version, from_version, read_preference=read_preference, **kwargs) + except (OperationFailure, AutoReconnect) as e: + # Log the exception so we know how often this is happening + log_exception('read', e, 1) + # If we've failed to read from the secondary, then it's possible the + # secondary has lagged. In this case direct the query to the primary. + _version = mongo_retry(self._read_metadata)(symbol, as_of=as_of, + read_preference=ReadPreference.PRIMARY) + return self._do_read_retry(symbol, _version, from_version, + read_preference=ReadPreference.PRIMARY, + **kwargs) + except Exception, e: + log_exception('read', e, 1) + raise + + @mongo_retry + def _show_info(self, symbol, as_of=None): + """ + Print details on the stored symbol: the underlying storage handler + and the version_document corresponding to the specified version. + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. + `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + """ + print self._get_info(symbol, as_of) + + def _get_info(self, symbol, as_of=None): + _version = self._read_metadata(symbol, as_of=as_of) + handler = self._read_handler(_version, symbol) + if hasattr(handler, "get_info"): + return handler.get_info(self._arctic_lib, _version, symbol) + else: + return """Handler: %s\n\nVersion document:\n%s""" % (handler.__class__.__name__, pprint.pformat(_version)) + + def _do_read(self, symbol, version, from_version=None, **kwargs): + handler = self._read_handler(version, symbol) + data = handler.read(self._arctic_lib, version, symbol, from_version=from_version, **kwargs) + if data is None: + raise NoDataFoundException("No data found for %s in library %s" % (symbol, self._arctic_lib.get_name())) + return VersionedItem(symbol=symbol, library=self._arctic_lib.get_name(), version=version['version'], + metadata=version.pop('metadata', None), data=data) + _do_read_retry = mongo_retry(_do_read) + + @mongo_retry + def read_metadata(self, symbol, as_of=None): + """ + Return the metadata saved for a symbol. This method is fast as it doesn't + actually load the data. + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. + `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + """ + _version = self._read_metadata(symbol, as_of=as_of) + return VersionedItem(symbol=symbol, library=self._arctic_lib.get_name(), version=_version['version'], + metadata=_version.pop('metadata', None), data=None) + + def _read_metadata(self, symbol, as_of=None, read_preference=None): + if read_preference is None: + # We want to hit the PRIMARY if querying secondaries is disabled. 
If we're allowed to query secondaries, + # then we want to hit the secondary for metadata. We maintain ordering of chunks vs. metadata, such that + # if metadata is available, we guarantee that chunks will be available. (Within a 10 minute window.) + read_preference = ReadPreference.PRIMARY_PREFERRED if not self._allow_secondary else ReadPreference.SECONDARY_PREFERRED + + versions_coll = self._versions.with_options(read_preference=read_preference) + + _version = None + if as_of is None: + _version = versions_coll.find_one({'symbol': symbol}, sort=[('version', pymongo.DESCENDING)]) + elif isinstance(as_of, basestring): + # as_of is a snapshot + snapshot = self._snapshots.find_one({'name': as_of}) + if snapshot: + _version = versions_coll.find_one({'symbol': symbol, 'parent': snapshot['_id']}) + elif isinstance(as_of, dt): + # as_of refers to a datetime + if not as_of.tzinfo: + as_of = as_of.replace(tzinfo=mktz()) + _version = versions_coll.find_one({'symbol': symbol, + '_id': {'$lt': bson.ObjectId.from_datetime(as_of + timedelta(seconds=1))}}, + sort=[('_id', pymongo.DESCENDING)]) + else: + # Backward compatibility - as of is a version number + _version = versions_coll.find_one({'symbol': symbol, 'version': as_of}) + + if not _version: + raise NoDataFoundException("No data found for %s in library %s" % (symbol, self._arctic_lib.get_name())) + + # if the item has been deleted, don't return any metadata + metadata = _version.get('metadata', None) + if metadata is not None and metadata.get('deleted', False) is True: + raise NoDataFoundException("No data found for %s in library %s" % (symbol, self._arctic_lib.get_name())) + + return _version + + @mongo_retry + def append(self, symbol, data, metadata=None, prune_previous_version=True, upsert=True, **kwargs): + """ + Append 'data' under the specified 'symbol' name to this library. + The exact meaning of 'append' is left up to the underlying store implementation. + + Parameters + ---------- + symbol : `str` + symbol name for the item + data : + to be persisted + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. + Default: True + upsert : `bool` + Write 'data' if no previous version exists. + """ + self._ensure_index() + self._arctic_lib.check_quota() + version = {'_id': bson.ObjectId()} + version['symbol'] = symbol + spec = {'symbol': symbol} + previous_version = self._versions.find_one(spec, + sort=[('version', pymongo.DESCENDING)]) + + if len(data) == 0 and previous_version is not None: + return VersionedItem(symbol=symbol, library=self._arctic_lib.get_name(), version=previous_version, + metadata=version.pop('metadata', None), data=None) + + if upsert and previous_version is None: + return self.write(symbol=symbol, data=data, prune_previous_version=prune_previous_version, metadata=metadata) + + assert previous_version is not None + + next_ver = self._version_nums.find_one({'symbol': symbol, 'version': previous_version['version']}) + + if next_ver is None: + raise ArcticException('''version_nums is out of sync with previous version document. + This probably means that either a version document write has previously failed, or the previous version has been deleted. 
+ Append not possible - please call write() to get versions back in sync''') + + # if the symbol has previously been deleted then overwrite + previous_metadata = previous_version.get('metadata', None) + if upsert and previous_metadata is not None and previous_metadata.get('deleted', False) is True: + return self.write(symbol=symbol, data=data, prune_previous_version=prune_previous_version, + metadata=metadata) + + handler = self._read_handler(previous_version, symbol) + + if metadata is not None: + version['metadata'] = metadata + elif 'metadata' in previous_version: + version['metadata'] = previous_version['metadata'] + + if handler and hasattr(handler, 'append'): + mongo_retry(handler.append)(self._arctic_lib, version, symbol, data, previous_version, **kwargs) + else: + raise Exception("Append not implemented for handler %s" % handler) + + next_ver = self._version_nums.find_one_and_update({'symbol': symbol, 'version': previous_version['version']}, + {'$inc': {'version': 1}}, + upsert=False, new=True) + + if next_ver is None: + #Latest version has changed during this operation + raise OptimisticLockException() + + version['version'] = next_ver['version'] + + # Insert the new version into the version DB + mongo_retry(self._versions.insert_one)(version) + + self._publish_change(symbol, version) + + if prune_previous_version and previous_version: + self._prune_previous_versions(symbol) + + return VersionedItem(symbol=symbol, library=self._arctic_lib.get_name(), version=version['version'], + metadata=version.pop('metadata', None), data=None) + + def _publish_change(self, symbol, version): + if self._publish_changes: + mongo_retry(self._changes.insert_one)(version) + + @mongo_retry + def write(self, symbol, data, metadata=None, prune_previous_version=True, **kwargs): + """ + Write 'data' under the specified 'symbol' name to this library. + + Parameters + ---------- + symbol : `str` + symbol name for the item + data : + to be persisted + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + Default: None + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. + Default: True + kwargs : + passed through to the write handler + + Returns + ------- + VersionedItem named tuple containing the metadata and verison number + of the written symbol in the store. 
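The version_nums counter bump in the append path above is effectively a compare-and-swap: the increment only succeeds if the counter still holds the version the append was based on. A dict-backed sketch of that check (bump_if_unchanged is an illustrative name):

    def bump_if_unchanged(version_nums, symbol, expected):
        if version_nums.get(symbol) != expected:
            return None                    # a concurrent writer got there first
        version_nums[symbol] = expected + 1
        return version_nums[symbol]

    counters = {'EURUSD': 7}
    assert bump_if_unchanged(counters, 'EURUSD', 7) == 8
    assert bump_if_unchanged(counters, 'EURUSD', 7) is None   # -> OptimisticLockException path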
+ """ + self._ensure_index() + self._arctic_lib.check_quota() + version = {'_id': bson.ObjectId()} + version['symbol'] = symbol + version['version'] = self._version_nums.find_one_and_update({'symbol': symbol}, + {'$inc': {'version': 1}}, + upsert=True, new=True)['version'] + version['metadata'] = metadata + + previous_version = self._versions.find_one({'symbol': symbol, 'version': {'$lt': version['version']}}, + sort=[('version', pymongo.DESCENDING)], + ) + + handler = self._write_handler(version, symbol, data, **kwargs) + mongo_retry(handler.write)(self._arctic_lib, version, symbol, data, previous_version, **kwargs) + + # Insert the new version into the version DB + mongo_retry(self._versions.insert_one)(version) + + if prune_previous_version and previous_version: + self._prune_previous_versions(symbol) + + logger.debug('Finished writing versions for %s', symbol) + + self._publish_change(symbol, version) + + return VersionedItem(symbol=symbol, library=self._arctic_lib.get_name(), version=version['version'], + metadata=version.pop('metadata', None), data=None) + + def _prune_previous_versions(self, symbol, keep_mins=120): + """ + Prune versions, not pointed at by snapshots which are at least keep_mins old. + """ + # Find all non-snapshotted versions older than a version that's at least keep_mins minutes old + # Based on documents available on the secondary + versions_find = mongo_retry(self._versions.with_options(read_preference=ReadPreference.SECONDARY_PREFERRED if keep_mins > 0 else + ReadPreference.PRIMARY) + .find) + versions = list(versions_find({ # Find versions of this symbol + 'symbol': symbol, + # Not snapshotted + '$or': [{'parent': {'$exists': False}}, {'parent': {'$size': 0}}], + # At least 'keep_mins' old + '_id': {'$lt': bson.ObjectId.from_datetime( + dt.utcnow() + # Add one second as the ObjectId str has random fuzz + + timedelta(seconds=1) + - timedelta(minutes=keep_mins)) + } + }, + # Using version number here instead of _id as there's a very unlikely case + # where the versions are created on different hosts or processes at exactly + # the same time. + sort=[('version', pymongo.DESCENDING)], + # Keep one, that's at least 10 mins old, around + # (cope with replication delay) + skip=1, + projection=['_id', 'type'], + )) + if not versions: + return + version_ids = [v['_id'] for v in versions] + + #Find any version_ids that are the basis of other, 'current' versions - don't prune these. + base_versions = set([x['base_version_id'] for x in mongo_retry(self._versions.find)({ + 'symbol': symbol, + '_id': {'$nin': version_ids}, + 'base_version_id':{'$exists':True}, + }, + projection=['base_version_id'], + )]) + + version_ids = list(set(version_ids) - base_versions) + + if not version_ids: + return + + # Delete the version documents + mongo_retry(self._versions.delete_many)({'_id': {'$in': version_ids}}) + # Cleanup any chunks + cleanup(self._arctic_lib, symbol, version_ids) + + @mongo_retry + def _delete_version(self, symbol, version_num, do_cleanup=True): + """ + Delete the n'th version of this symbol from the historical collection. 
+ """ + version = self._versions.find_one({'symbol': symbol, 'version': version_num}) + if not version: + logger.error("Can't delete %s:%s as not found in DB" % (symbol, version_num)) + return + # If the version is pointed to by a snapshot, then can't delete + if version.get('parent', None): + for parent in version['parent']: + snap_name = self._snapshots.find_one({'_id': parent}) + if snap_name: + snap_name = snap_name['name'] + logger.error("Can't delete: %s:%s as pointed to by snapshot: %s" % (symbol, version['version'], + snap_name)) + return + self._versions.delete_one({'_id': version['_id']}) + if do_cleanup: + cleanup(self._arctic_lib, symbol, [version['_id']]) + + @mongo_retry + def delete(self, symbol): + """ + Delete all versions of the item from the current library which aren't + currently part of some snapshot. + + Parameters + ---------- + symbol : `str` + symbol name to delete + """ + logger.warn("Deleting data item: %r from %r" % (symbol, self._arctic_lib.get_name())) + # None is the magic sentinel value that indicates an item has been deleted. + sentinel = self.write(symbol, None, prune_previous_version=False, metadata={'deleted': True}) + self._prune_previous_versions(symbol, 0) + + # If there aren't any other versions, then we don't need the sentinel empty value + # so delete the sentinel version altogether + snapped_version = self._versions.find_one({'symbol': symbol, + 'metadata.deleted': {'$ne': True}}) + if not snapped_version: + self._delete_version(symbol, sentinel.version) + assert not self.has_symbol(symbol) + + def _write_audit(self, user, message, changed_version): + """ + Creates an audit entry, which is much like a snapshot in that + it references versions and provides some history of the changes made. + """ + audit = {'_id': bson.ObjectId(), + 'user': user, + 'message': message, + 'symbol': changed_version.symbol + } + orig_version = changed_version.orig_version.version + new_version = changed_version.new_version.version + audit['orig_v'] = orig_version + audit['new_v'] = new_version + # Update the versions to contain the audit + mongo_retry(self._versions.update_many)({'symbol': changed_version.symbol, + 'version': {'$in': [orig_version, new_version]} + }, + {'$addToSet': {'parent': audit['_id']}}) + # Create the audit entry + mongo_retry(self._audit.insert_one)(audit) + + def snapshot(self, snap_name, metadata=None, skip_symbols=None): + """ + Snapshot the current versions of symbols in the library. Can be used like: + + Parameters + ---------- + snap_name : `str` + name of the snapshot + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + skip_symbols : `collections.Iterable` + optional symbols to be excluded from the snapshot + """ + # Ensure the user doesn't insert duplicates + snapshot = self._snapshots.find_one({'name': snap_name}) + if snapshot: + raise DuplicateSnapshotException("Snapshot '%s' already exists." 
% snap_name) + + # Create a snapshot version document + snapshot = {'_id': bson.ObjectId()} + snapshot['name'] = snap_name + snapshot['metadata'] = metadata + + if skip_symbols is None: + skip_symbols = set() + else: + skip_symbols = set(skip_symbols) + + # Loop over, and snapshot all versions except those we've been asked to skip + for sym in set(self.list_symbols()) - skip_symbols: + try: + sym = self._read_metadata(sym, read_preference=ReadPreference.PRIMARY) + # Update the parents field of the version document + mongo_retry(self._versions.update_one)({'_id': sym['_id']}, + {'$addToSet': {'parent': snapshot['_id']}}) + except NoDataFoundException: + # Version has been deleted, not included in the snapshot + pass + mongo_retry(self._snapshots.insert_one)(snapshot) + + def delete_snapshot(self, snap_name): + """ + Delete a named snapshot + + Parameters + ---------- + symbol : `str` + The snapshot name to delete + """ + snapshot = self._snapshots.find_one({'name': snap_name}) + if not snapshot: + raise NoDataFoundException("Snapshot %s not found!" % snap_name) + + # Find all the versions pointed at by the snapshot + versions = list(self._versions + .find({'parent': snapshot['_id']}, projection=['symbol', 'version'])) + # Remove the snapshot Id as a parent of versions + self._versions.update_many({'parent': snapshot['_id']}, + {'$pull': {'parent': snapshot['_id']}}) + + self._snapshots.delete_one({'name': snap_name}) + + def list_snapshots(self): + """ + List the snapshots in the library + + Returns + ------- + string list of snapshot names + """ + return dict((i['name'], i['metadata']) for i in self._snapshots.find()) + + def stats(self): + """ + Return storage statistics about the library + + Returns + ------- + dictionary of storage stats + """ + + res = {} + db = self._collection.database + conn = db.connection + res['sharding'] = {} + try: + sharding = conn.config.databases.find_one({'_id': db.name}) + if sharding: + res['sharding'].update(sharding) + res['sharding']['collections'] = list(conn.config.collections.find({'_id': {'$regex': '^' + db.name + "\..*"}})) + except OperationFailure: + # Access denied + pass + res['dbstats'] = db.command('dbstats') + res['chunks'] = db.command('collstats', self._collection.name) + res['versions'] = db.command('collstats', self._versions.name) + res['snapshots'] = db.command('collstats', self._snapshots.name) + res['totals'] = {'count': res['chunks']['count'], + 'size': res['chunks']['size'] + res['versions']['size'] + res['snapshots']['size'], + } + return res + + def _fsck(self, dry_run): + """ + Run a consistency check on this VersionStore library. + """ + # Cleanup Orphaned Chunks + self._cleanup_orphaned_chunks(dry_run) + # Cleanup Orphaned Snapshots + self._cleanup_orphaned_versions(dry_run) + + def _cleanup_orphaned_chunks(self, dry_run): + """ + Fixes any chunks who have parent pointers to missing versions. + Removes the broken parent pointer and, if there are no other parent pointers for the chunk, + removes the chunk. + """ + lib = self + chunks_coll = lib._collection + versions_coll = chunks_coll.versions + + logger.info("ORPHANED CHUNK CHECK: %s" % self._arctic_lib.get_name()) + for symbol in chunks_coll.distinct('symbol'): + logger.debug('Checking %s' % symbol) + # Be liberal with the generation time. 
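The check that follows boils down to a set difference between the parent ids referenced by chunks and the version ids that still exist; anything left over is a leaked version. Made-up ids:

    version_ids = {'v1', 'v2'}
    chunk_parents = {'v1', 'v2', 'v3'}     # 'v3' was deleted without cleanup
    assert sorted(chunk_parents - version_ids) == ['v3']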
+ gen_time = dt.now() - timedelta(days=1) + parent_id_constraint = {'$lt': bson.ObjectId.from_datetime(gen_time)} + + # For each symbol, grab all 'real' versions + versions = set(versions_coll.find({'symbol': symbol, + '_id': parent_id_constraint}).distinct('_id')) + # Using aggregate so we can unwind, and pull out 'parent', where 'parent' is older than a day. + parents = chunks_coll.aggregate([{'$match': {'symbol': symbol}}, + {'$project': {'parent': True}}, + {'$unwind': '$parent'}, + {'$match': {'parent': parent_id_constraint}}, + {'$group': {'_id': '$parent'}}, + ]) + parent_ids = set([x['_id'] for x in parents]) + + leaked_versions = sorted(parent_ids - versions) + if len(leaked_versions): + logger.info("%s leaked %d versions" % (symbol, len(leaked_versions))) + for x in leaked_versions: + chunk_count = chunks_coll.find({'symbol': symbol, 'parent': x}).count() + logger.info("%s: Missing Version %s (%s) ; %s chunks ref'd" % (symbol, + x.generation_time, + x, + chunk_count + )) + if versions_coll.find_one({'symbol': symbol, '_id': x}) is not None: + raise Exception("Error: version (%s) is found for (%s), but shouldn't be!" % + (x, symbol)) + # Now cleanup the leaked versions + if not dry_run: + cleanup(lib._arctic_lib, symbol, leaked_versions) + + def _cleanup_orphaned_versions(self, dry_run): + """ + Fixes any versions who have parent pointers to missing snapshots. + Note, doesn't delete the versions, just removes the parent pointer if it no longer + exists in snapshots. + """ + lib = self + versions_coll = lib._collection.versions + snapshots_coll = lib._collection.snapshots + + logger.info("ORPHANED SNAPSHOT CHECK: %s" % self._arctic_lib.get_name()) + + # Be liberal with the generation time. + gen_time = dt.now() - timedelta(days=1) + parent_id_constraint = {'$lt': bson.ObjectId.from_datetime(gen_time)} + + # For each symbol, grab all 'real' snapshots and audit entries + snapshots = set(snapshots_coll.distinct('_id')) + snapshots |= set(lib._audit.distinct('_id')) + # Using aggregate so we can unwind, and pull out 'parent', where 'parent' is older than a day. + parents = versions_coll.aggregate([{'$project': {'parent': True}}, + {'$unwind': '$parent'}, + {'$match': {'parent': parent_id_constraint}}, + {'$group': {'_id': '$parent'}}, + ]) + parent_ids = set([x['_id'] for x in parents]) + + leaked_snaps = sorted(parent_ids - snapshots) + if len(leaked_snaps): + logger.info("leaked %d snapshots" % (len(leaked_snaps))) + for x in leaked_snaps: + ver_count = versions_coll.find({'parent': x}).count() + logger.info("Missing Snapshot %s (%s) ; %s versions ref'd" % (x.generation_time, + x, + ver_count + )) + if snapshots_coll.find_one({'_id': x}) is not None: + raise Exception("Error: snapshot (%s) is found, but shouldn't be!" % + (x)) + # Now cleanup the leaked snapshots + if not dry_run: + versions_coll.update_many({'parent': x}, + {'$pull': {'parent': x}}) diff --git a/arctic/store/versioned_item.py b/arctic/store/versioned_item.py new file mode 100644 index 000000000..25c7594ba --- /dev/null +++ b/arctic/store/versioned_item.py @@ -0,0 +1,19 @@ +from collections import namedtuple + + +class VersionedItem(namedtuple('VersionedItem', ['symbol', 'library', 'data', 'version', 'metadata'])): + """ + Class representing a Versioned object in VersionStore. 
+ """ + def metadata_dict(self): + return {'symbol': self.symbol, 'library': self.library, 'version': self.version} + + def __repr__(self): + return str(self) + + def __str__(self): + return "VersionedItem(symbol=%s,library=%s,data=%s,version=%s,metadata=%s" % \ + (self.symbol, self.library, type(self.data), self.version, self.metadata) + + +ChangedItem = namedtuple('ChangedItem', ['symbol', 'orig_version', 'new_version', 'changes']) diff --git a/arctic/tickstore/__init__.py b/arctic/tickstore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/arctic/tickstore/tickstore.py b/arctic/tickstore/tickstore.py new file mode 100644 index 000000000..997914632 --- /dev/null +++ b/arctic/tickstore/tickstore.py @@ -0,0 +1,604 @@ +from bson.binary import Binary +from datetime import datetime as dt, timedelta +import lz4 +import numpy as np +import pandas as pd +from pandas.core.frame import _arrays_to_mgr +import pymongo +from pymongo.errors import OperationFailure +import pytz + +from ..date import DateRange, to_pandas_closed_closed, mktz, datetime_to_ms, ms_to_datetime +from ..decorators import mongo_retry +from ..exceptions import OverlappingDataException, \ + NoDataFoundException, UnhandledDtypeException, ArcticException +from ..logging import logger +from .._util import indent + + +# Example-Schema: +# -------------- +# {ID: ObjectId('52b1d39eed5066ab5e87a56d'), +# SYMBOL: u'symbol' +# INDEX: Binary('...', 0), +# IMAGE_DOC: { IMAGE: { +# 'ASK': 10. +# ... +# } +# 's': +# 't': DateTime(...) +# } +# COLUMNS: { +# 'ACT_FLAG1': { +# DATA: Binary('...', 0), +# DTYPE: u'U1', +# ROWMASK: Binary('...', 0)}, +# 'ACVOL_1': { +# DATA: Binary('...', 0), +# DTYPE: u'float64', +# ROWMASK: Binary('...', 0)}, +# ... +# } +# START: DateTime(...), +# END: DateTime(...), +# END_SEQ: 31553879L, +# SEGMENT: 1386933906826L, +# SHA: 1386933906826L, +# VERSION: 3, +# } + +TICK_STORE_TYPE = 'TickStoreV3' + +ID = '_id' +SYMBOL = 'sy' +INDEX = 'i' +START = 's' +END = 'e' +START_SEQ = 'sS' +END_SEQ = 'eS' +SEGMENT = 'se' +SHA = 'sh' +IMAGE_DOC = 'im' +IMAGE = 'i' + +COLUMNS = 'cs' +DATA = 'd' +DTYPE = 't' +ROWMASK = 'm' + +COUNT = 'c' +VERSION = 'v' + +CHUNK_VERSION_NUMBER = 3 + + +class TickStore(object): + + chunk_size = 100000 + + @classmethod + def initialize_library(cls, arctic_lib, **kwargs): + TickStore(arctic_lib)._ensure_index() + + @mongo_retry + def _ensure_index(self): + collection = self._collection + collection.create_index([(SYMBOL, pymongo.ASCENDING), + (START, pymongo.ASCENDING)], background=True) + collection.create_index([(START, pymongo.ASCENDING)], background=True) + + def __init__(self, arctic_lib): + self._arctic_lib = arctic_lib + + # Do we allow reading from secondaries + self._allow_secondary = self._arctic_lib.arctic._allow_secondary + + # The default collections + self._collection = arctic_lib.get_top_level_collection() + + def __getstate__(self): + return {'arctic_lib': self._arctic_lib} + + def __setstate__(self, state): + return TickStore.__init__(self, state['arctic_lib']) + + def __str__(self): + return """<%s at %s> +%s""" % (self.__class__.__name__, hex(id(self)), indent(str(self._arctic_lib), 4)) + + def __repr__(self): + return str(self) + + def delete(self, symbol, date_range=None): + """ + Delete all chunks for a symbol. + + Which are, for the moment, fully contained in the passed in + date_range. 
+ + Parameters + ---------- + symbol : `str` + symbol name for the item + date_range : `date.DateRange` + DateRange to delete ticks in + """ + query = {SYMBOL: symbol} + date_range = to_pandas_closed_closed(date_range) + if date_range is not None: + assert date_range.start and date_range.end + if date_range.start: + start = self._to_dt(date_range.start) + if date_range.end: + end = self._to_dt(date_range.end) + query[START] = {'$gte': start} + query[END] = {'$lte': end} + self._collection.delete_many(query) + + def list_symbols(self, date_range=None): + return self._collection.distinct(SYMBOL) + + def _mongo_date_range_query(self, symbol, date_range): + # Handle date_range + if not date_range: + date_range = DateRange() + + # Find the start bound + start_range = {} + first = last = None + if date_range.start: + start = date_range.start + startq = self._symbol_query(symbol) + startq.update({START: {'$lte': start}}) + first = self._collection.find_one(startq, + # Service entirely from the index + projection={START: 1, ID: 0}, + sort=[(START, pymongo.DESCENDING)]) + if first: + start_range['$gte'] = first[START] + + # Find the end bound + if date_range.end: + end = date_range.end + endq = self._symbol_query(symbol) + endq.update({START: {'$gt': end}}) + last = self._collection.find_one(endq, + # Service entirely from the index + projection={START: 1, ID: 0}, + sort=[(START, pymongo.ASCENDING)]) + else: + logger.info("No end provided. Loading a month for: {}:{}".format(symbol, first)) + if not first: + first = self._collection.find_one(self._symbol_query(symbol), + projection={START: 1, ID: 0}, + sort=[(START, pymongo.ASCENDING)]) + if not first: + raise NoDataFoundException() + last = first[START] + last = {START: last + timedelta(days=30)} + if last: + start_range['$lt'] = last[START] + + # Return chunks in the specified range + if not start_range: + return {} + return {START: start_range} + + def _symbol_query(self, symbol): + if isinstance(symbol, basestring): + query = {SYMBOL: symbol} + elif symbol is not None: + query = {SYMBOL: {'$in': symbol}} + else: + query = {} + return query + + def read(self, symbol, date_range=None, columns=None, include_images=False, _target_tick_count=0): + """ + Read data for the named symbol. Returns a VersionedItem object with + a data and metdata element (as passed into write). 
+ + Parameters + ---------- + symbol : `str` + symbol name for the item + date_range : `date.DateRange` + Returns ticks in the specified DateRange + columns : `list` of `str` + Columns (fields) to return from the tickstore + include_images : `bool` + Should images (/snapshots) be included in the read + Returns + ------- + pandas.DataFrame of data + """ + perf_start = dt.now() + rtn = {} + column_set = set() + + multiple_symbols = not isinstance(symbol, basestring) + + date_range = to_pandas_closed_closed(date_range) + query = self._symbol_query(symbol) + query.update(self._mongo_date_range_query(symbol, date_range)) + + if columns: + projection = dict([(SYMBOL, 1), + (INDEX, 1), + (START, 1), + (VERSION, 1), + (IMAGE_DOC, 1)] + + [(COLUMNS + '.%s' % c, 1) for c in columns]) + column_set.update([c for c in columns if c != 'SYMBOL']) + else: + projection = dict([(SYMBOL, 1), + (INDEX, 1), + (START, 1), + (VERSION, 1), + (COLUMNS, 1), + (IMAGE_DOC, 1)]) + + column_dtypes = {} + ticks_read = 0 + for b in self._collection.find(query, projection=projection).sort([(START, pymongo.ASCENDING)],): + data = self._read_bucket(b, column_set, column_dtypes, + multiple_symbols or (columns is not None and 'SYMBOL' in columns), + include_images) + for k, v in data.iteritems(): + try: + rtn[k].append(v) + except KeyError: + rtn[k] = [v] + # For testing + ticks_read += len(data[INDEX]) + if _target_tick_count and ticks_read > _target_tick_count: + break + + if not rtn: + raise NoDataFoundException("No Data found for {} in range: {}".format(symbol, date_range)) + rtn = self._pad_and_fix_dtypes(rtn, column_dtypes) + + index = pd.to_datetime(np.concatenate(rtn[INDEX]), unit='ms') + if columns is None: + columns = [x for x in rtn.keys() if x not in (INDEX, 'SYMBOL')] + if multiple_symbols and 'SYMBOL' not in columns: + columns = ['SYMBOL', ] + columns + + if len(index) > 0: + arrays = [np.concatenate(rtn[k]) for k in columns] + else: + arrays = [[] for k in columns] + + if multiple_symbols: + sort = np.argsort(index) + index = index[sort] + arrays = [a[sort] for a in arrays] + + t = (dt.now() - perf_start).total_seconds() + logger.info("Got data in %s secs, creating DataFrame..." % t) + mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=None) + rtn = pd.DataFrame(mgr) + + t = (dt.now() - perf_start).total_seconds() + ticks = len(rtn) + logger.info("%d rows in %s secs: %s ticks/sec" % (ticks, t, int(ticks / t))) + if not rtn.index.is_monotonic: + logger.error("TimeSeries data is out of order, sorting!") + rtn = rtn.sort_index() + if date_range: + # FIXME: support DateRange.interval... + rtn = rtn.ix[date_range.start:date_range.end] + return rtn + + def _pad_and_fix_dtypes(self, cols, column_dtypes): + # Pad out Nones with empty arrays of appropriate dtypes + rtn = {} + index = cols[INDEX] + full_length = len(index) + for k, v in cols.iteritems(): + if k != INDEX and k != 'SYMBOL': + col_len = len(v) + if col_len < full_length: + v = ([None, ] * (full_length - col_len)) + v + assert len(v) == full_length + for i, arr in enumerate(v): + if arr is None: + # Replace Nones with appropriate-length empty arrays + v[i] = self._empty(len(index[i]), column_dtypes.get(k)) + else: + # Promote to appropriate dtype only if we can safely cast all the values + # This avoids the case with strings where None is cast as 'None'. 
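The padding step here is the reason a column written only in later buckets still reads back full-length: earlier buckets contribute an all-NaN block of the right size. A standalone sketch with two made-up buckets:

    import numpy as np

    index_chunks = [np.arange(3), np.arange(2)]        # two buckets: 3 rows + 2 rows
    bid_chunks = [None, np.array([1.01, 1.02])]        # 'BID' only exists in bucket 2

    padded = [np.full(len(ix), np.nan) if chunk is None else chunk
              for ix, chunk in zip(index_chunks, bid_chunks)]
    bid = np.concatenate(padded)
    assert len(bid) == 5 and np.isnan(bid[:3]).all()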
+ # Casting the object to a string is not worthwhile anyway as Pandas changes the + # dtype back to objectS + if (i == 0 or v[i].dtype != v[i - 1].dtype) and np.can_cast(v[i].dtype, column_dtypes[k], + casting='safe'): + v[i] = v[i].astype(column_dtypes[k], casting='safe') + + rtn[k] = v + return rtn + + def _set_or_promote_dtype(self, column_dtypes, c, dtype): + existing_dtype = column_dtypes.get(c) + if existing_dtype is None or existing_dtype != dtype: + # Promote ints to floats - as we can't easily represent NaNs + if np.issubdtype(dtype, int): + dtype = np.dtype('f8') + column_dtypes[c] = np.promote_types(column_dtypes.get(c, dtype), dtype) + + def _prepend_image(self, document, im): + image = im[IMAGE] + first_dt = im['t'] + if not first_dt.tzinfo: + first_dt = first_dt.replace(tzinfo=mktz('UTC')) + document[INDEX] = np.insert(document[INDEX], 0, np.uint64(datetime_to_ms(first_dt))) + for field in document: + if field == INDEX or document[field] is None: + continue + if field in image: + val = image[field] + else: + logger.debug("Field %s is missing from image!", field) + val = np.nan + document[field] = np.insert(document[field], 0, document[field].dtype.type(val)) + return document + + def _read_bucket(self, doc, columns, column_dtypes, include_symbol, include_images): + rtn = {} + if doc[VERSION] != 3: + raise ArcticException("Unhandled document version: %s" % doc[VERSION]) + rtn[INDEX] = np.cumsum(np.fromstring(lz4.decompress(doc[INDEX]), dtype='uint64')) + doc_length = len(rtn[INDEX]) + rtn_length = len(rtn[INDEX]) + if include_symbol: + rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length + columns.update(doc[COLUMNS].keys()) + for c in columns: + try: + coldata = doc[COLUMNS][c] + dtype = np.dtype(coldata[DTYPE]) + values = np.fromstring(lz4.decompress(str(coldata[DATA])), dtype=dtype) + self._set_or_promote_dtype(column_dtypes, c, dtype) + rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c]) + rowmask = np.unpackbits(np.fromstring(lz4.decompress(str(coldata[ROWMASK])), + dtype='uint8'))[:doc_length].astype('bool') + rtn[c][rowmask] = values + except KeyError: + rtn[c] = None + + if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}): + rtn = self._prepend_image(rtn, doc[IMAGE_DOC]) + return rtn + + def _empty(self, length, dtype): + if dtype is not None and dtype == np.float64: + rtn = np.empty(length, dtype) + rtn[:] = np.nan + return rtn + else: + return np.empty(length, dtype=np.object_) + + def stats(self): + """ + Return storage statistics about the library + + Returns + ------- + dictionary of storage stats + """ + res = {} + db = self._collection.database + conn = db.connection + res['sharding'] = {} + try: + sharding = conn.config.databases.find_one({'_id': db.name}) + if sharding: + res['sharding'].update(sharding) + res['sharding']['collections'] = list(conn.config.collections.find( + {'_id': {'$regex': '^' + db.name + "\..*"}})) + except OperationFailure: + # Access denied + pass + res['dbstats'] = db.command('dbstats') + res['chunks'] = db.command('collstats', self._collection.name) + res['totals'] = {'count': res['chunks']['count'], + 'size': res['chunks']['size'], + } + return res + + def _assert_nonoverlapping_data(self, symbol, start, end): + # + # Imagine we're trying to insert a tick bucket like: + # |S------ New-B -------------- E| + # |---- 1 ----| |----- 2 -----| |----- 3 -----| + # + # S = New-B Start + # E = New-B End + # New-B overlaps with existing buckets 1,2,3 + # + # All we need to do is find the bucket who's start is immediately before (E) + # If 
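The row-mask handling in _read_bucket above is what lets sparse columns be stored densely: a packed bitmask marks which rows of the bucket carry a value. A minimal sketch of the unpack-and-scatter step with made-up data:

    import numpy as np

    doc_length = 10
    values = np.array([1.5, 2.5, 3.5])                          # the stored, dense values
    mask_bits = np.array([1, 0, 0, 1, 0, 0, 0, 0, 0, 1], dtype='uint8')
    rowmask = np.unpackbits(np.packbits(mask_bits))[:doc_length].astype('bool')

    col = np.empty(doc_length)
    col[:] = np.nan
    col[rowmask] = values                                       # scatter into place
    assert col[3] == 2.5 and np.isnan(col[1]) and col[9] == 3.5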
that document's end is > S, then we know it overlaps + # with this bucket. + doc = self._collection.find_one({SYMBOL: symbol, + START: {'$lt': end} + }, + projection={START: 1, + END: 1, + '_id': 0}, + sort=[(START, pymongo.DESCENDING)]) + if doc: + if not doc[END].tzinfo: + doc[END] = doc[END].replace(tzinfo=mktz('UTC')) + if doc[END] > start: + raise OverlappingDataException("Document already exists with start:{} end:{} in the range of our start:{} end:{}".format( + doc[START], doc[END], start, end)) + + def write(self, symbol, data): + """ + Writes a list of market data events. + + Parameters + ---------- + symbol : `str` + symbol name for the item + data : list of dicts + List of ticks to store to the tick-store. + """ + pandas = False + # Check for overlapping data + if isinstance(data, list): + start = data[0]['index'] + end = data[-1]['index'] + elif isinstance(data, pd.DataFrame): + start = data.index[0].to_datetime() + end = data.index[-1].to_datetime() + pandas = True + else: + raise UnhandledDtypeException("Can't persist type %s to tickstore" % type(data)) + self._assert_nonoverlapping_data(symbol, self._to_dt(start), self._to_dt(end)) + + if pandas: + buckets = self._pandas_to_buckets(data, symbol) + else: + buckets = self._to_buckets(data, symbol) + self._write(buckets) + + def _write(self, buckets): + start = dt.now() + mongo_retry(self._collection.insert_many)(buckets) + t = (dt.now() - start).total_seconds() + ticks = len(buckets) * self.chunk_size + print "%d buckets in %s: approx %s ticks/sec" % (len(buckets), t, int(ticks / t)) + + def _pandas_to_buckets(self, x, symbol): + rtn = [] + for i in range(0, len(x), self.chunk_size): + rtn.append(self._pandas_to_bucket(x[i:i + self.chunk_size], symbol)) + return rtn + + def _to_buckets(self, x, symbol): + rtn = [] + for i in range(0, len(x), self.chunk_size): + rtn.append(self._to_bucket(x[i:i + self.chunk_size], symbol)) + return rtn + + def _to_ms(self, date): + if isinstance(date, dt): + logger.warn('WARNING: treating naive datetime as London in write path') + return datetime_to_ms(date) + return date + + def _to_dt(self, date, default_tz=None): + if isinstance(date, (int, long)): + return ms_to_datetime(date, mktz('UTC')) + elif date.tzinfo is None: + if default_tz is None: + raise ValueError("Must specify a TimeZone on incoming data") + # Treat naive datetimes as London + return date.replace(tzinfo=mktz()) + return date + + def _str_dtype(self, dtype): + """ + Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order. 
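A brief usage sketch for the write() path above; tick_lib stands in for an already-initialized tick-store library, and the symbol and prices are made up. Both accepted shapes are shown: a list of dicts keyed by 'index', and a DataFrame with a timezone-aware DatetimeIndex. Either way the data is checked for overlap against existing buckets, split into chunk_size-sized buckets, and inserted with insert_many:

from datetime import datetime
import pandas as pd
import pytz

# List-of-dicts form: each tick is a dict carrying an 'index' timestamp.
ticks = [{'index': datetime(2014, 1, 1, 9, 0, tzinfo=pytz.utc), 'BID': 100.0, 'ASK': 100.1},
         {'index': datetime(2014, 1, 1, 9, 1, tzinfo=pytz.utc), 'BID': 100.2, 'ASK': 100.3}]
tick_lib.write('EQUITY.SYM', ticks)

# DataFrame form: the index supplies the timestamps (here a later, non-overlapping day).
df = pd.DataFrame({'BID': [100.0, 100.2], 'ASK': [100.1, 100.3]},
                  index=pd.date_range('2014-01-02 09:00', periods=2, freq='1min', tz='UTC'))
tick_lib.write('EQUITY.SYM', df)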
+ """ + assert dtype.byteorder != '>' + if (dtype.kind) == 'i': + assert dtype.itemsize == 8 + return 'int64' + elif (dtype.kind) == 'f': + assert dtype.itemsize == 8 + return 'float64' + elif (dtype.kind) == 'U': + return 'U%d' % (dtype.itemsize / 4) + else: + raise UnhandledDtypeException("Bad dtype '%s'" % dtype) + + + def _ensure_supported_dtypes(self, array): + # We only support these types for now, as we need to read them in Java + if (array.dtype.kind) == 'i': + array = array.astype(' 1 or (len(library_metadata) == 1 and library_metadata[0] != library_name): + raise OverlappingDataException("""There are libraries that overlap with the date range: +library: {} +overlapping libraries: {}""".format(library_name, [l.library for l in library_metadata])) + self._collection.update_one({'library_name': library_name}, + {'$set': {'start': start, 'end': end}}, upsert=True) + + def read(self, symbol, date_range, columns=['BID', 'ASK', 'TRDPRC_1', 'BIDSIZE', 'ASKSIZE', 'TRDVOL_1'], **kwargs): + libraries = self._get_libraries(date_range) + dfs = [l.library.read(symbol, l.date_range.intersection(date_range), columns) for l in libraries] + return pd.concat(dfs) + + def write(self, symbol, data): + # get the full set of date ranges that we have + cursor = self._collection.find() + for res in cursor: + library = self._arctic_lib.arctic[res['library_name']] + dslice = self._slice(data, res['start'], res['end']) + if len(dslice) != 0: + library.write(symbol, dslice) + + def list_symbols(self, date_range): + libraries = self._get_libraries(date_range) + return sorted(list(set(itertools.chain(*[l.library.list_symbols() for l in libraries])))) + + def get_name(self): + name = self._arctic_lib.get_name() + if name.startswith(self._arctic_lib.DB_PREFIX + '_'): + name = name[len(self._arctic_lib.DB_PREFIX) + 1:] + return name + + def _get_libraries(self, date_range): + libraries = self._get_library_metadata(date_range) + + rtn = [TickStoreLibrary(self._arctic_lib.arctic[library.library], library.date_range) + for library in libraries] + current_start = rtn[-1].date_range.end if rtn else dt(1970, 1, 1, 0, 0) # epoch + if date_range.end is None or current_start < date_range.end: + name = self.get_name() + db_name, tick_type = name.split('.', 1) + current_lib = "{}_current.{}".format(db_name, tick_type) + try: + rtn.append(TickStoreLibrary(self._arctic_lib.arctic[current_lib], + DateRange(current_start, None, OPEN_OPEN))) + except LibraryNotFoundException: + pass # No '_current', move on. + + if not rtn: + raise NoDataFoundException("No underlying libraries exist for the given date range") + return rtn + + def _slice(self, data, start, end): + if isinstance(data, list): + dictlist = DictList(data, 'index') + slice_start = bisect.bisect_left(dictlist, start) + slice_end = bisect.bisect_right(dictlist, end) + return data[slice_start:slice_end] + elif isinstance(data, pd.DataFrame): + return data[start:end] + else: + raise UnhandledDtypeException("Can't persist type %s to tickstore" % type(data)) + + def _get_library_metadata(self, date_range): + """ + Retrieve the libraries for the given date range, the assumption is that the date ranges do not overlap and + they are CLOSED_CLOSED. 
+ + At the moment the date range is mandatory + """ + if date_range is None: + raise Exception("A date range must be provided") + if not (date_range.start and date_range.end): + raise Exception("The date range {0} must contain a start and end date".format(date_range)) + + start = date_range.start if date_range.start.tzinfo is not None else date_range.start.replace(tzinfo=mktz()) + end = date_range.end if date_range.end.tzinfo is not None else date_range.end.replace(tzinfo=mktz()) + query = {'$or': [{'start': {'$lte': start}, 'end': {'$gte': start}}, + {'start': {'$gte': start}, 'end': {'$lte': end}}, + {'start': {'$lte': end}, 'end': {'$gte': end}}]} + return [TickStoreLibrary(res['library_name'], DateRange(res['start'], res['end'], CLOSED_CLOSED)) + for res in self._collection.find(query, + projection={'library_name': 1, + 'start': 1, 'end': 1}, + sort=[('start', pymongo.ASCENDING)])] diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..a15128ec8 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1 @@ +from ahl.pkgutils.sphinx.conf import * diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..33404c6ff --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,19 @@ +.. arctic documentation master file + +arctic +=============================== + +.. toctree:: + :maxdepth: 4 + + autodoc/arctic + +.. automodule:: arctic + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/howtos/how_to_custom_arctic_library.py b/howtos/how_to_custom_arctic_library.py new file mode 100644 index 000000000..c237c7296 --- /dev/null +++ b/howtos/how_to_custom_arctic_library.py @@ -0,0 +1,163 @@ +from datetime import datetime as dt +from bson.binary import Binary +import cPickle + +from arctic import Arctic, register_library_type +from arctic.decorators import mongo_retry + + +# +# Arctic maps a library, e.g. 'jblackburn.stuff' to a class instance +# which implements whatever API you like. +# +# Arctic provides a standard switching layer for: +# - Registering custom storage types (e.g. CustomArcticLibType) +# - Mapping data libraries to a storage type (e.g. 'jblackburn.stuff' -> CustomArcticLibType) +# - Handling Authentication +# - Maintaining per-library metadata +# - Quota +# + + +class Stuff(object): + """ + Some custom class persisted by our CustomArcticLibType Library Type + """ + def __init__(self, field1, date_field, stuff): + # Some string field + self.field1 = field1 + # Some date field + self.date_field = date_field + # Arbitrary other stuff + self.stuff = stuff + + +class CustomArcticLibType(object): + """ + Custom Arctic Library for storing 'Stuff' items + """ + + # Choose a library type name that's unique; e.g. .DataType + _LIBRARY_TYPE = 'test.CustomArcticLibType' + + def __init__(self, arctic_lib): + self._arctic_lib = arctic_lib + + # Arctic_lib gives you a root pymongo.Collection just-for-you: + # You may store all your data in here ... + self._collection = arctic_lib.get_top_level_collection() + # ... or you can create 'sub-collections', e.g. 
+ self._sub_collection = self._collection.sub_collection + + # The name of this library + print "My name is %s" % arctic_lib.get_name() + + # Fetch some per-library metadata for this library + self.some_metadata = arctic_lib.get_library_metadata('some_metadata') + + @classmethod + def initialize_library(cls, arctic_lib, **kwargs): + # Persist some per-library metadata in this arctic_lib + arctic_lib.set_library_metadata('some_metadata', 'some_value') + CustomArcticLibType(arctic_lib)._ensure_index() + + def _ensure_index(self): + """ + Index any fields used by your queries. + """ + collection = self._collection + # collection.add_indexes + collection.create_index('field1') + + ########################################### + # Create your own API below! + ########################################### + + @mongo_retry + def query(self, *args, **kwargs): + """ + Generic query method. + + In reality, your storage class would have its own query methods, + + Performs a Mongo find on the Marketdata index metadata collection. + See: + http://api.mongodb.org/python/current/api/pymongo/collection.html + """ + for x in self._collection.find(*args, **kwargs): + x['stuff'] = cPickle.loads(x['stuff']) + del x['_id'] # Remove default unique '_id' field from doc + yield Stuff(**x) + + @mongo_retry + def stats(self): + """ + Database usage statistics. Used by quota. + """ + res = {} + db = self._collection.database + res['dbstats'] = db.command('dbstats') + res['data'] = db.command('collstats', self._collection.name) + res['totals'] = {'count': res['data']['count'], + 'size': res['data']['size'] + } + return res + + @mongo_retry + def store(self, thing): + """ + Simple persistence method + """ + to_store = {'field1': thing.field1, + 'date_field': thing.date_field, + } + to_store['stuff'] = Binary(cPickle.dumps(thing.stuff)) + # Respect any soft-quota on write - raises if stats().totals.size > quota + self._arctic_lib.check_quota() + self._collection.insert_one(to_store) + + @mongo_retry + def delete(self, query): + """ + Simple delete method + """ + self._collection.delete_one(query) + + +# Hook the class in for the type string 'CustomArcticLibType' +register_library_type(CustomArcticLibType._LIBRARY_TYPE, CustomArcticLibType) + +# Create a Arctic instance pointed at a mongo host +store = Arctic(mongo_host) + +### Initialize the library +# Map username.custom_lib -> CustomArcticLibType +store.initialize_library('username.custom_lib', CustomArcticLibType._LIBRARY_TYPE) + +# Now pull our username.custom_lib ; note that it has the: +# - query(...) +# - store(...) +# - delete(...) +# API we defined above +lib = store['username.custom_lib'] + + +# Store some items in the custom library type +lib.store(Stuff('thing', dt(2012, 1, 1), object())) +lib.store(Stuff('thing2', dt(2013, 1, 1), object())) +lib.store(Stuff('thing3', dt(2014, 1, 1), object())) +lib.store(Stuff(['a', 'b', 'c'], dt(2014, 1, 1), object())) + + +# Do some querying via our library's query method. +# You would have your own methods for querying here... 
(which use your index(es), of course) +list(lib.query()) # Get everything +list(lib.query({'field1': 'thing'})) # just get by name +list(lib.query({'field1': 'a'})) # Can query lists +list(lib.query({'field1': 'b'})) +list(lib.query({'date_field': {'$lt': dt(2013, 2, 2)}})) +list(lib.query({'field1':'thing', + 'date_field': {'$lt': dt(2013, 2, 2)} })) + +# Remove everything +lib.delete({}) diff --git a/howtos/how_to_use_arctic.py b/howtos/how_to_use_arctic.py new file mode 100644 index 000000000..3ee2d65c5 --- /dev/null +++ b/howtos/how_to_use_arctic.py @@ -0,0 +1,65 @@ +# +# Arctic Key-Value store +# + +from arctic import Arctic +from datetime import datetime as dt +import pandas as pd + + +# Connect to the mongo-host / cluster +store = Arctic(mongo_host) + +# Data is grouped into 'libraries'. +# Users may have one or more named libraries: +store.list_libraries() + +# Create a library +store.initialize_library('username.scratch') + +# Get a library +# library = m['username.'] +library = store['username.scratch'] + +# Store some data in the library +df = pd.DataFrame({'prices': [1, 2, 3]}, + [dt(2014, 1, 1), dt(2014, 1, 2), dt(2014, 1, 3)]) +library.write('SYMBOL', df) + +# Read some data from the library +# (Note the returned object has an associated version number and metadata.) +library.read('SYMBOL') + +# Store some data into the library +library.write('MY_DATA', library.read('SYMBOL').data) + +# What symbols (keys) are stored in the library +library.list_symbols() + +# Delete the data item +library.delete('MY_DATA') + + +# Other library functionality + +# Store 'metadata' alongside a data item +library.write('MY_DATA', library.read('SYMBOL').data, metadata={'some_key': 'some_value'}) + +# Query avaialable symbols based on metadata +library.list_symbols(some_key='some_value') + +# Find available versions of a symbol +list(library.list_versions('SYMBOL')) + +# Snapshot a library +# (Point-in-time named reference for all symbols in a library.) +library.snapshot('snapshot_name') +library.list_snapshots() + +# Get an old version of a symbol +library.read('SYMBOL', as_of=1) +# Geta version given a snapshot name +library.read('SYMBOL', as_of='snapshot_name') + +# Delete a snapshot +library.delete_snapshot('snapshot_name') diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..f6f7961d4 --- /dev/null +++ b/setup.py @@ -0,0 +1,123 @@ +# +# Copyright (C) 2015 Man AHL +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 +# USA + +import os +from setuptools import setup, Extension +from setuptools.command.test import test as TestCommand + + +# Utility function to read the README file. 
+def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + + +class PyTest(TestCommand): + user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] + + def initialize_options(self): + TestCommand.initialize_options(self) + self.pytest_args = [] + + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run_tests(self): + # import here, cause outside the eggs aren't loaded + import pytest + args = [self.pytest_args] if isinstance(self.pytest_args, basestring) else list(self.pytest_args) + args.extend(['--cov', 'arctic', + '--cov-report', 'xml', + '--cov-report', 'html', + '--junitxml', 'junit.xml' + ]) + errno = pytest.main(args) + sys.exit(errno) + + +# setuptools_cython: setuptools DWIM monkey-patch madness +# http://mail.python.org/pipermail/distutils-sig/2007-September/thread.html#8204 +import sys +if 'setuptools.extension' in sys.modules: + m = sys.modules['setuptools.extension'] + m.Extension.__dict__ = m._Extension.__dict__ + +# Cython lz4 +compress = Extension('arctic._compress', + sources=["src/_compress.pyx", "src/lz4.c", "src/lz4hc.c"], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) + +setup( + name="arctic", + version="1.0.0", + author="Man AHL Technology", + author_email="ManAHLTech@ahl.com", + description=("AHL Research Versioned TimeSeries and Tick store"), + license="GPL", + keywords=["ahl", "keyvalue", "tickstore", "mongo", "timeseries", ], + url="https://github.com/ahlmss/arctic", + packages=['arctic', 'tests'], + long_description="", # read('README'), + cmdclass={'test': PyTest}, + ext_modules=[compress], + setup_requires=["setuptools_cython", + "Cython", + "numpy", + ], + install_requires=["decorator", + "enum34", + "lz4", + "mockextras", + "pandas", + "pymongo>=3.0", + "python-dateutil", + "pytz", + "tzlocal", + ], + tests_require=["mock", + "mockextras", + "pytest", + "pytest-cov", + "pytest-dbfixtures", + "pytest-timeout", + "pytest-xdist", + ], + entry_points={'console_scripts': [ + 'arctic_init_library = arctic.scripts.arctic_init_library:main', + 'arctic_list_libraries = arctic.scripts.arctic_list_libraries:main', + 'arctic_delete_library = arctic.scripts.arctic_delete_library:main', + 'arctic_enable_sharding = arctic.scripts.arctic_enable_sharding:main', + 'arctic_copy_data = arctic.scripts.arctic_copy_data:main', + 'arctic_create_user = arctic.scripts.arctic_create_user:main', + 'arctic_prune_versions = arctic.scripts.arctic_prune_versions:main', + 'arctic_fsck = arctic.scripts.arctic_fsck:main', + ] + }, + classifiers=[ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Cython", + "Topic :: Database", + "Topic :: Database :: Front-Ends", + "Topic :: Software Development :: Libraries", + ], +) diff --git a/src/_compress.pyx b/src/_compress.pyx new file mode 100644 index 000000000..c7e517c0a --- /dev/null +++ b/src/_compress.pyx @@ -0,0 +1,246 @@ +# cython: profile=True + +# +# LZ4 code was copied from: https://github.com/steeve/python-lz4/ r8ac9cf9df8fb8d51f40a3065fa538f8df1c8a62a 22/4/2015 [tt] +# + +cdef extern from "lz4.h": + cdef int LZ4_compress(char* source, char* dest, int inputSize) nogil + cdef int LZ4_compressBound(int isize) nogil + cdef int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int 
maxOutputSize) nogil + +cdef extern from "lz4hc.h": + cdef int LZ4_compressHC(char* source, char* dest, int inputSize) nogil + +cimport cython +cimport cpython +cimport libc.stdio +cimport openmp + +from libc.stdlib cimport malloc, free, realloc +from libc.stdint cimport uint8_t, uint32_t +from libc.stdio cimport printf +from cpython.string cimport PyString_AsString +from cython.view cimport array as cvarray +from cython.parallel import prange +from cython.parallel import threadid +from cython.parallel cimport parallel + +cdef void store_le32(char *c, uint32_t x) nogil: + c[0] = x & 0xff + c[1] = (x >> 8) & 0xff + c[2] = (x >> 16) & 0xff + c[3] = (x >> 24) & 0xff + +cdef uint32_t load_le32(char *c) nogil: + cdef uint8_t *d = c + return d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24) + + +cdef int hdr_size = sizeof(uint32_t) + +cdef char ** to_cstring_array(list_str): + """ Convert a python string list to a **char + Note: Performs a malloc. You must free the array once created. + """ + cdef char **ret = malloc(len(list_str) * sizeof(char *)) + for i in xrange(len(list_str)): + ret[i] = PyString_AsString(list_str[i]) + return ret + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def compress(pString): + return _compress(pString, LZ4_compress) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def compressHC(pString): + return _compress(pString, LZ4_compressHC) + + +cdef _compress(pString, int (*Fnptr_LZ4_compress)(char *, char *, int)): + # sizes + cdef uint32_t compressed_size + cdef uint32_t original_size = len(pString) + + # buffers + cdef char *cString = pString + cdef char *result # destination buffer + cdef bytes pyResult # python wrapped result + + # calc. 
estaimted compresed size + compressed_size = LZ4_compressBound(original_size) + # alloc memory + result = malloc(compressed_size + hdr_size) + # store original size + store_le32(result, original_size); + # compress & update size + compressed_size = Fnptr_LZ4_compress(cString, result + hdr_size, original_size) + # cast back into a python sstring + pyResult = result[:compressed_size + hdr_size] + + free(result) + + return pyResult + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def decompress(pString): + + # sizes + cdef uint32_t compressed_size = len(pString) + cdef uint32_t original_size + + # buffers + cdef char *cString # *char pStr + cdef char *result # destination buffer + cdef bytes pyResult # python wrapped result + + # convert to char* + cString = pString + # find original size + original_size = load_le32(cString) + # malloc + result = malloc(original_size) + # decompress + LZ4_decompress_safe(cString + hdr_size, result, compressed_size - hdr_size, original_size) + # cast back into python string + pyResult = result[:original_size] + + free(result) + return pyResult + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def compressarr(pStrList): + return _compressarr(pStrList, LZ4_compress) + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def compressarrHC(pStrList): + return _compressarr(pStrList, LZ4_compressHC) + + +cdef _compressarr(pStrList, int (*Fnptr_LZ4_compress)(char *, char *, int) nogil): + + if len(pStrList) == 0: + return [] + + cdef char **cStrList = to_cstring_array(pStrList) + cdef Py_ssize_t n = len(pStrList) + + # loop parameters + cdef char *cString + cdef int original_size + cdef uint32_t compressed_size + cdef char *result + cdef Py_ssize_t i + + # output parameters + cdef char **cResult = malloc(n * sizeof(char *)) + cdef int[:] lengths = cvarray(shape=(n,), itemsize=sizeof(int), format="i") + cdef int[:] orilengths = cvarray(shape=(n,), itemsize=sizeof(int), format="i") + cdef bytes pyResult + + # store original string lengths + for i in range(n): + orilengths[i] = len(pStrList[i]) + + with nogil, parallel(): + for i in prange(n, schedule='static'): + cString = cStrList[i] + original_size = orilengths[i] + # calc. 
estaimted compresed size + compressed_size = LZ4_compressBound(original_size) + # alloc memory + result = malloc(compressed_size + hdr_size) + # store original size + store_le32(result, original_size) + # compress & update size + compressed_size = Fnptr_LZ4_compress(cString, result + hdr_size, original_size) + # assign to result + lengths[i] = compressed_size + hdr_size + cResult[i] = result + + # cast back to python + result_list = [] + for i in range(n): + pyResult = cResult[i][:lengths[i]] + free(cResult[i]) + result_list.append(pyResult) + + free(cResult) + free(cStrList) + + return result_list + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.nonecheck(False) +@cython.cdivision(True) +def decompressarr(pStrList): + + if len(pStrList) == 0: + return [] + + cdef char **cStrList = to_cstring_array(pStrList) + cdef Py_ssize_t n = len(pStrList) + + # loop parameters + cdef char *cString + cdef uint32_t original_size + cdef uint32_t compressed_size + cdef char *result + cdef Py_ssize_t i + + # output parameters + cdef char **cResult = malloc(n * sizeof(char *)) + cdef int[:] clengths = cvarray(shape=(n,), itemsize=sizeof(int), format="i") + cdef int[:] lengths = cvarray(shape=(n,), itemsize=sizeof(int), format="i") + cdef bytes pyResult + + for i in range(n): + clengths[i] = len(pStrList[i]) + + with nogil, parallel(): + for i in prange(n, schedule='static'): + cString = cStrList[i] + # get compressed size + compressed_size = clengths[i] + # find original size + original_size = load_le32(cString) + # malloc + result = malloc(original_size) + # decompress + LZ4_decompress_safe(cString + hdr_size, result, compressed_size - hdr_size, original_size) + # assign to result + cResult[i] = result + lengths[i] = original_size + + # cast back to python + result_list = [] + for i in range(n): + pyResult = cResult[i][:lengths[i]] + free(cResult[i]) + result_list.append(pyResult) + + free(cResult) + free(cStrList) + + return result_list diff --git a/src/lz4.c b/src/lz4.c new file mode 100644 index 000000000..b900f7a09 --- /dev/null +++ b/src/lz4.c @@ -0,0 +1,1247 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
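Stepping back from the Cython internals of _compress.pyx above: every buffer produced by compress()/compressHC() is a 4-byte little-endian header holding the uncompressed length (written by store_le32, read back by load_le32 in decompress()), followed by the raw LZ4 block. Below is a pure-Python sketch of that framing, plus an assumed usage of the compiled module (this presumes the extension has been built, e.g. with build_ext --inplace):

import struct

# Frame layout used by the module: [uint32 little-endian original size][LZ4 block]
def frame(original, lz4_block):
    return struct.pack('<I', len(original)) + lz4_block

def split_frame(framed):
    (original_size,) = struct.unpack_from('<I', framed, 0)
    return original_size, framed[4:]          # size to allocate, then the LZ4 payload

# Assumed usage of the built extension; the *arr variants compress/decompress a
# list of strings in parallel via the OpenMP prange loops shown above.
from arctic import _compress
blob = 'tick data, tick data, tick data ' * 64
assert _compress.decompress(_compress.compressHC(blob)) == blob
blobs_back = _compress.decompressarr(_compress.compressarr([blob, blob.upper()]))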
+ + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/************************************** + Tuning parameters +**************************************/ +/* + * HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). + */ +#define HEAPMODE 0 + + +/************************************** + CPU Feature Detection +**************************************/ +/* 32 or 64 bits ? */ +#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__powerpc64le__) \ + || defined(__ppc64__) || defined(__ppc64le__) \ + || defined(__PPC64__) || defined(__PPC64LE__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) /* Detects 64 bits mode */ +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +/* + * Little Endian or Big Endian ? + * Overwrite the #define below if you know your architecture endianess + */ +#include /* Apparently required to detect endianess */ +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, such as x86. + * For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property + * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* Define this parameter if your target system or compiler does not support hardware bit count */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : + * This option may provide a small boost to performance for some big endian cpu, although probably modest. + * You may set this option to 1 if data will remain within closed environment. 
+ * This option is useless on Little_Endian CPU (such as x86) + */ + +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + + +/************************************** + Compiler Options +**************************************/ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +/* "restrict" is a known keyword */ +#else +# define restrict /* Disable restrict */ +#endif + +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include /* For Visual 2005 */ +# if LZ4_ARCH64 /* 64-bits */ +# pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ +# pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ +# else /* 32-bits */ +# pragma intrinsic(_BitScanForward) /* For Visual 2005 */ +# pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ +# endif +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER /* Visual Studio */ +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** + Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** + Includes +**************************************/ +#include "lz4.h" + + +/************************************** + Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct { U16 v; } _PACKED U16_S; +typedef struct { U32 v; } _PACKED U32_S; +typedef struct { U64 v; } _PACKED U64_S; +typedef struct {size_t v;} _PACKED size_t_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(0) +# else +# pragma pack(pop) +# endif +#endif + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) +#define AARCH(x) (((size_t_S *)(x))->v) + + +/************************************** + Constants +**************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define HASH_SIZE_U32 (1 << LZ4_HASHLOG) + +#define MINMATCH 4 + +#define 
COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1U<<10) +#define MB *(1U<<20) +#define GB *(1U<<30) + +#define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1)) +#define SKIPSTRENGTH 6 /* Increasing this value will make the compression run slower on incompressible data */ + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<=e; */ +#else +# define LZ4_WILDCOPY(d,s,e) { if (likely(e-d <= 8)) LZ4_COPY8(d,s) else do { LZ4_COPY8(d,s) } while (d>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif +# endif +} + +#else + +int LZ4_NbCommonBytes (register U32 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +# endif +} + +#endif + + +/******************************** + Compression functions +********************************/ +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } + +static int LZ4_hashSequence(U32 sequence, tableType_t tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + } +} + +static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = 
LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIndictSize; + const BYTE* const dictionary = dictPtr->dictionary; + const BYTE* const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + size_t refDelta=0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=(int)LZ4_64KLIMIT)) return 0; /* Size too large (not within 64K limit) */ + if (inputSize> skipStrength; + //if (step>8) step=8; // required for valid forwardIp ; slows down uncompressible data a bit + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + ref = LZ4_getPositionOnHash(h, ctx, tableType, base); + if (dict==usingExtDict) + { + if (ref<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (ref < lowRefLimit) : 0) + || ((tableType==byU16) ? 
0 : (ref + MAX_DISTANCE < ip)) + || (A32(ref+refDelta) != A32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (ref+refDelta > lowLimit) && (unlikely(ip[-1]==ref[refDelta-1]))) { ip--; ref--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; /* Check output limit */ + if (litLength>=RUN_MASK) + { + int len = (int)litLength-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, limit); + ip += MINMATCH + matchLength; + if (ip==limit) + { + unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchLength += more; + ip += more; + } + } + else + { + matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, matchlimit); + ip += MINMATCH + matchLength; + } + + if (matchLength>=ML_MASK) + { + if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) + return 0; /* Check output limit */ + *token += ML_MASK; + matchLength -= ML_MASK; + for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } + if (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + ref = LZ4_getPosition(ip, ctx, tableType, base); + if (dict==usingExtDict) + { + if (ref<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + LZ4_putPosition(ip, ctx, tableType, base); + if ( ((dictIssue==dictSmall) ? 
(ref>=lowRefLimit) : 1) + && (ref+MAX_DISTANCE>=ip) + && (A32(ref+refDelta)==A32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + int lastRun = (int)(iend - anchor); + if ((outputLimited) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; /* Check output limit */ + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + if (dict->initCheck) MEM_INIT(dict, 0, sizeof(LZ4_stream_t_internal)); /* Uninitialized structure detected */ + + if (dictSize < MINMATCH) + { + dict->dictionary = NULL; + dict->dictSize = 0; + return 1; + } + + if (p <= dictEnd - 64 KB) p = dictEnd - 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-MINMATCH) + { + LZ4_putPosition(p, dict, byU32, base); + p+=3; + } + + return 1; +} + + +void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ + { + /* rescale hash table */ + U32 delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +FORCE_INLINE int LZ4_compress_continue_generic (void* LZ4_stream, const char* source, char* dest, int inputSize, + int maxOutputSize, limitedOutput_directive limit) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) + { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, dictSmall); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, noDictIssue); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = 
LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, dictSmall); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, noDictIssue); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +int LZ4_compress_continue (void* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, 0, notLimited); +} + +int LZ4_compress_limitedOutput_continue (void* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput); +} + + +// Hidden debug function, to force separate dictionary mode +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + + result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +int LZ4_saveDict (void* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memcpy(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return 1; +} + + + +/**************************** + Decompression functions +****************************/ +/* + * This generic decompression function cover all use cases. + * It shall be instanciated several times, using different sets of directives + * Note that it is essential this generic function is really inlined, + * in order to remove useless branches during compilation optimisation. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* source, + char* dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const char* dictStart, /* only if dict==usingExtDict */ + int dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* restrict ip = (const BYTE*) source; + const BYTE* ref; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = (const BYTE*)dest - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; +//#define OLD +#ifdef OLD + const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; /* static reduces speed for LZ4_decompress_safe() on GCC64 */ +#else + const size_t dec32table[] = {4-0, 4-3, 4-2, 4-3, 4-0, 4-0, 4-0, 4-0}; /* static reduces speed for LZ4_decompress_safe() on GCC64 */ +#endif + static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + const int checkOffset = (endOnInput) && (dictSize < (int)(64 KB)); + + + /* Special cases */ + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + + /* Main Loop */ + while (1) + { + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s; + do + { + s = *ip++; + length += s; + } + while (likely((endOnInput)?ipLZ4_MAX_INPUT_SIZE)) goto _output_error; /* overflow detection */ + if ((sizeof(void*)==4) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error; /* quickfix issue 134 */ + if ((endOnInput) && (sizeof(void*)==4) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error; /* quickfix issue 134 */ + } + + /* copy literals */ + cpy = op+length; + if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if ((checkOffset) && (unlikely(ref < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ + + /* get matchlength */ + if ((length=(token&ML_MASK)) == ML_MASK) + { + unsigned s; + do + { + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + s = *ip++; + length += s; + } while (s==255); + //if ((sizeof(void*)==4) && unlikely(length>LZ4_MAX_INPUT_SIZE)) goto _output_error; /* overflow detection */ + if ((sizeof(void*)==4) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* quickfix issue 134 
*/ + } + + /* check external dictionary */ + if ((dict==usingExtDict) && (ref < (BYTE* const)dest)) + { + if (unlikely(op+length+MINMATCH > oend-LASTLITERALS)) goto _output_error; + + if (length+MINMATCH <= (size_t)(dest-(char*)ref)) + { + ref = dictEnd - (dest-(char*)ref); + memcpy(op, ref, length+MINMATCH); + op += length+MINMATCH; + } + else + { + size_t copySize = (size_t)(dest-(char*)ref); + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + copySize = length+MINMATCH - copySize; + if (copySize > (size_t)((char*)op-dest)) /* overlap */ + { + BYTE* const cpy = op + copySize; + const BYTE* ref = (BYTE*)dest; + while (op < cpy) *op++ = *ref++; + } + else + { + memcpy(op, dest, copySize); + op += copySize; + } + } + continue; + } + + /* copy repeated sequence */ + if (unlikely((op-ref)<(int)STEPSIZE)) + { + const size_t dec64 = dec64table[(sizeof(void*)==4) ? 0 : op-ref]; + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; +#ifdef OLD + op += 4, ref += 4; ref -= dec32table[op-ref]; + A32(op) = A32(ref); + op += STEPSIZE-4; ref -= dec64; +#else + ref += dec32table[op-ref]; + A32(op+4) = A32(ref); + op += STEPSIZE; ref -= dec64; +#endif + } else { LZ4_COPYSTEP(op,ref); } + cpy = op + length - (STEPSIZE-4); + + if (unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4))) + { + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last 5 bytes must be literals */ + if (opdictionary = dictionary; + lz4sd->dictSize = dictSize; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setDictDecode() +*/ +int LZ4_decompress_safe_continue (void* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); + if (result <= 0) return result; + if (lz4sd->dictionary + lz4sd->dictSize == dest) + { + lz4sd->dictSize += result; + } + else + { + lz4sd->dictionary = dest; + lz4sd->dictSize = result; + } + + return result; +} + +int LZ4_decompress_fast_continue (void* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); + if (result <= 0) return result; + if (lz4sd->dictionary + lz4sd->dictSize == dest) + { + lz4sd->dictSize += result; + } + else + { + lz4sd->dictionary = dest; + lz4sd->dictSize = result; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, 
char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, dictStart, dictSize); +} + + +/*************************************************** + Obsolete Functions +***************************************************/ +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +void LZ4_init(LZ4_stream_t_internal* lz4ds, const BYTE* base) +{ + MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); + lz4ds->bufferStart = base; +} + +int LZ4_resetStreamState(void* state, const char* inputBuffer) +{ + if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t_internal*)state, (const BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (const char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(4, LZ4_STREAMSIZE_U32); + LZ4_init ((LZ4_stream_t_internal*)lz4ds, (const BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* lz4ds = (LZ4_stream_t_internal*)LZ4_Data; + + LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)lz4ds->bufferStart, 64 KB); + + return (char*)(lz4ds->bufferStart + 64 KB); +} + +/* Obsolete compresson functions using User-allocated state */ + +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize) +{ + if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ + MEM_INIT(state, 0, LZ4_STREAMSIZE); + + if (inputSize < (int)LZ4_64KLIMIT) + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue); + else + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue); +} + +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize) +{ + if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ + MEM_INIT(state, 0, LZ4_STREAMSIZE); + + if (inputSize < (int)LZ4_64KLIMIT) + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue); + else + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, (sizeof(void*)==8) ? 
byU32 : byPtr, noDict, noDictIssue); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, NULL, 64 KB); +} \ No newline at end of file diff --git a/src/lz4.h b/src/lz4.h new file mode 100644 index 000000000..1064fa115 --- /dev/null +++ b/src/lz4.h @@ -0,0 +1,306 @@ +/* + LZ4 - Fast LZ compression algorithm + Header File + Copyright (C) 2011-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + + +/************************************** + Version +**************************************/ +#define LZ4_VERSION_MAJOR 1 /* for major interface/format changes */ +#define LZ4_VERSION_MINOR 2 /* for minor interface/format changes */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + + +/************************************** + Tuning parameter +**************************************/ +/* + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#define LZ4_MEMORY_USAGE 14 + + +/************************************** + Simple Functions +**************************************/ + +int LZ4_compress (const char* source, char* dest, int inputSize); +int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxOutputSize); + +/* +LZ4_compress() : + Compresses 'inputSize' bytes from 'source' into 'dest'. 
+ Destination buffer must be already allocated, + and must be sized to handle worst cases situations (input data not compressible) + Worst case size evaluation is provided by function LZ4_compressBound() + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + return : the number of bytes written in buffer dest + or 0 if the compression fails + +LZ4_decompress_safe() : + compressedSize : is obviously the source size + maxOutputSize : is the size of the destination buffer, which must be already allocated. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + If the destination buffer is not large enough, decoding will stop and output an error code (<0). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits : + it never writes outside of output buffer, and never reads outside of input buffer. + Therefore, it is protected against malicious data packets. +*/ + + +/* +Note : + Should you prefer to explicitly allocate compression-table memory using your own allocation method, + use the streaming functions provided below, simply reset the memory area between each call to LZ4_compress_continue() +*/ + + +/************************************** + Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) + primarily useful for memory allocation of output buffer. + macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). + + isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +int LZ4_compressBound(int isize); + + +/* +LZ4_compress_limitedOutput() : + Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. + If it cannot achieve it, compression will stop, and result of the function will be zero. + This function never writes outside of provided output buffer. + + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxOutputSize : is the size of the destination buffer (which must be already allocated) + return : the number of bytes written in buffer 'dest' + or 0 if the compression fails +*/ +int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); + + +/* +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function is a bit faster than LZ4_decompress_safe() + It provides fast decompression and fully respect memory boundaries for properly formed compressed data. + It does not provide full protection against intentionnally modified data stream. + Use this function in a trusted environment (data to decode comes from a trusted source). 
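+
+  As an illustrative sketch (src, cmp, out and SRC_SIZE below are placeholder
+  names, not defined by this header), the simple safe path documented above is
+  typically used as follows :
+      char cmp[LZ4_COMPRESSBOUND(SRC_SIZE)];                    // worst-case output buffer
+      int  csize = LZ4_compress(src, cmp, SRC_SIZE);            // 0 means compression failed
+      char out[SRC_SIZE];
+      int  dsize = LZ4_decompress_safe(cmp, out, csize, SRC_SIZE);   // negative means malformed input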
+*/ +int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'compressedSize' at position 'source' + into output buffer 'dest' of size 'maxOutputSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ +int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxOutputSize); + + +/*********************************************** + Experimental Streaming Compression Functions +***********************************************/ + +#define LZ4_STREAMSIZE_U32 ((1 << (LZ4_MEMORY_USAGE-2)) + 8) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U32 * sizeof(unsigned int)) +/* + * LZ4_stream_t + * information structure to track an LZ4 stream. + * important : set this structure content to zero before first use ! + */ +typedef struct { unsigned int table[LZ4_STREAMSIZE_U32]; } LZ4_stream_t; + +/* + * If you prefer dynamic allocation methods, + * LZ4_createStream + * provides a pointer (void*) towards an initialized LZ4_stream_t structure. + * LZ4_free just frees it. + */ +void* LZ4_createStream(); +int LZ4_free (void* LZ4_stream); + + +/* + * LZ4_loadDict + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed (same effect as init). + * Return : 1 if OK, 0 if error + */ +int LZ4_loadDict (void* LZ4_stream, const char* dictionary, int dictSize); + +/* + * LZ4_compress_continue + * Compress data block 'source', using blocks compressed before as dictionary to improve compression ratio + * Previous data blocks are assumed to still be present at their previous location. + */ +int LZ4_compress_continue (void* LZ4_stream, const char* source, char* dest, int inputSize); + +/* + * LZ4_compress_limitedOutput_continue + * Same as before, but also specify a maximum target compressed size (maxOutputSize) + * If objective cannot be met, compression exits, and returns a zero. + */ +int LZ4_compress_limitedOutput_continue (void* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* + * LZ4_saveDict + * If previously compressed data block is not guaranteed to remain at its previous memory location + * save it into a safe place (char* safeBuffer) + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call again LZ4_compress_continue() + * Return : 1 if OK, 0 if error + * Note : any dictSize > 64 KB will be interpreted as 64KB. 
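+ *
+ * A minimal streaming sketch using the functions in this section (block, cmp,
+ * blockSize and nbBlocks are illustrative placeholders, not part of this header) :
+ *     void* s = LZ4_createStream();
+ *     char  safe[64 * 1024];                        // holds the rolling dictionary
+ *     for (int i = 0; i < nbBlocks; i++)
+ *     {
+ *         int csize = LZ4_compress_continue(s, block[i], cmp, blockSize);
+ *         // only needed if block[i] may be overwritten before the next call :
+ *         LZ4_saveDict(s, safe, sizeof(safe));
+ *     }
+ *     LZ4_free(s);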
+ */ +int LZ4_saveDict (void* LZ4_stream, char* safeBuffer, int dictSize); + + +/************************************************ + Experimental Streaming Decompression Functions +************************************************/ + +#define LZ4_STREAMDECODESIZE_U32 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U32 * sizeof(unsigned int)) +/* + * LZ4_streamDecode_t + * information structure to track an LZ4 stream. + * important : set this structure content to zero before first use ! + */ +typedef struct { unsigned int table[LZ4_STREAMDECODESIZE_U32]; } LZ4_streamDecode_t; + +/* + * If you prefer dynamic allocation methods, + * LZ4_createStreamDecode() + * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure. + * LZ4_free just frees it. + */ +void* LZ4_createStreamDecode(); +int LZ4_free (void* LZ4_stream); /* yes, it's the same one as for compression */ + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setDictDecode() +*/ +int LZ4_decompress_safe_continue (void* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize); +int LZ4_decompress_fast_continue (void* LZ4_streamDecode, const char* source, char* dest, int originalSize); + +/* + * LZ4_setDictDecode + * Use this function to instruct where to find the dictionary. + * This function can be used to specify a static dictionary, + * or to instruct where to find some previously decoded data saved into a different memory space. + * Setting a size of 0 is allowed (same effect as no dictionary). + * Return : 1 if OK, 0 if error + */ +int LZ4_setDictDecode (void* LZ4_streamDecode, const char* dictionary, int dictSize); + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as + a combination of LZ4_setDictDecode() followed by LZ4_decompress_x_continue() + all together into a single function call. + It doesn't use nor update an LZ4_streamDecode_t structure. +*/ +int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize); +int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + + + +/************************************** + Obsolete Functions +**************************************/ +/* +Obsolete decompression functions +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
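+Porting is a mechanical rename ; for example a call such as
+    LZ4_uncompress(src, dst, originalSize);   // argument names are illustrative
+can be replaced verbatim by
+    LZ4_decompress_fast(src, dst, originalSize);
+following the equivalences below :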
+- LZ4_uncompress is the same as LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize); +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete functions for externally allocated state; use streaming interface instead */ +int LZ4_sizeofState(void); +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +void* LZ4_create (const char* inputBuffer); +int LZ4_sizeofStreamState(void); +int LZ4_resetStreamState(void* state, const char* inputBuffer); +char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int compressedSize, int maxOutputSize); +int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int originalSize); + + +#if defined (__cplusplus) +} +#endif diff --git a/src/lz4hc.c b/src/lz4hc.c new file mode 100755 index 000000000..608674902 --- /dev/null +++ b/src/lz4hc.c @@ -0,0 +1,892 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Copyright (C) 2011-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + - LZ4 source repository : http://code.google.com/p/lz4/ +*/ + + + +/************************************** + Tuning Parameter +**************************************/ +#define LZ4HC_DEFAULT_COMPRESSIONLEVEL 8 + + +/************************************** + Memory routines +**************************************/ +#include /* calloc, free */ +#define ALLOCATOR(s) calloc(1,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** + CPU Feature Detection +**************************************/ +/* 32 or 64 bits ? 
*/ +#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__powerpc64le__) \ + || defined(__ppc64__) || defined(__ppc64le__) \ + || defined(__PPC64__) || defined(__PPC64LE__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) /* Detects 64 bits mode */ +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +/* + * Little Endian or Big Endian ? + * Overwrite the #define below if you know your architecture endianess + */ +#include /* Apparently required to detect endianess */ +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, such as x86. + * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected + * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* Define this parameter if your target system or compiler does not support hardware bit count */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** + Compiler Options +**************************************/ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +/* "restrict" is a known keyword */ +#else +# define restrict /* Disable restrict */ +#endif + +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include /* For Visual 2005 */ +# if LZ4_ARCH64 /* 64-bits */ +# pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ +# pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ +# else /* 32-bits */ +# pragma intrinsic(_BitScanForward) /* For Visual 2005 */ +# pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ +# endif +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable used */ +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER /* Visual Studio */ +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + + +/************************************** + Includes +**************************************/ +#include "lz4hc.h" +#include "lz4.h" + + +/************************************** + Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; 
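+  /* exact-width types come from <stdint.h> on C99 compilers ; the #else branch
+     below falls back to equivalent plain integer typedefs for older compilers */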
+#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# ifdef __IBMC__ +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct _U16_S { U16 v; } _PACKED U16_S; +typedef struct _U32_S { U32 v; } _PACKED U32_S; +typedef struct _U64_S { U64 v; } _PACKED U64_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# pragma pack(pop) +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + + +/************************************** + Constants +**************************************/ +#define MINMATCH 4 + +#define DICTIONARY_LOGSIZE 16 +#define MAXD (1<> ((MINMATCH*8)-HASH_LOG)) +#define HASH_VALUE(p) HASH_FUNCTION(A32(p)) +#define HASH_POINTER(p) (HashTable[HASH_VALUE(p)] + base) +#define DELTANEXT(p) chainTable[(size_t)(p) & MAXD_MASK] +#define GETNEXT(p) ((p) - (size_t)DELTANEXT(p)) + + +/************************************** + Private functions +**************************************/ +#if LZ4_ARCH64 + +FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) +{ +#if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +#else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; +# endif +#endif +} + +#else + +FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +#else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +#endif +} + +#endif + + +int LZ4_sizeofStreamStateHC() +{ + 
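+    /* size of the caller-allocated HC state : the structure holds the hash table,
+       the chain table used by the match finder and the stream tracking pointers,
+       so callers that allocate the state themselves (see LZ4_resetStreamStateHC
+       below) know how much memory is required */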
return sizeof(LZ4HC_Data_Structure); +} + +FORCE_INLINE void LZ4_initHC (LZ4HC_Data_Structure* hc4, const BYTE* base) +{ + MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable)); + MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); + hc4->nextToUpdate = base + 1; + hc4->base = base; + hc4->inputBuffer = base; + hc4->end = base; +} + +int LZ4_resetStreamStateHC(void* state, const char* inputBuffer) +{ + if ((((size_t)state) & (sizeof(void*)-1)) != 0) return 1; /* Error : pointer is not aligned for pointer (32 or 64 bits) */ + LZ4_initHC((LZ4HC_Data_Structure*)state, (const BYTE*)inputBuffer); + return 0; +} + + +void* LZ4_createHC (const char* inputBuffer) +{ + void* hc4 = ALLOCATOR(sizeof(LZ4HC_Data_Structure)); + LZ4_initHC ((LZ4HC_Data_Structure*)hc4, (const BYTE*)inputBuffer); + return hc4; +} + + +int LZ4_freeHC (void* LZ4HC_Data) +{ + FREEMEM(LZ4HC_Data); + return (0); +} + + +/* Update chains up to ip (excluded) */ +FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip) +{ + U16* chainTable = hc4->chainTable; + HTYPE* HashTable = hc4->hashTable; + INITBASE(base,hc4->base); + + while(hc4->nextToUpdate < ip) + { + const BYTE* const p = hc4->nextToUpdate; + size_t delta = (p) - HASH_POINTER(p); + if (delta>MAX_DISTANCE) delta = MAX_DISTANCE; + DELTANEXT(p) = (U16)delta; + HashTable[HASH_VALUE(p)] = (HTYPE)((p) - base); + hc4->nextToUpdate++; + } +} + + +char* LZ4_slideInputBufferHC(void* LZ4HC_Data) +{ + LZ4HC_Data_Structure* hc4 = (LZ4HC_Data_Structure*)LZ4HC_Data; + U32 distance = (U32)(hc4->end - hc4->inputBuffer) - 64 KB; + distance = (distance >> 16) << 16; /* Must be a multiple of 64 KB */ + LZ4HC_Insert(hc4, hc4->end - MINMATCH); + memcpy((void*)(hc4->end - 64 KB - distance), (const void*)(hc4->end - 64 KB), 64 KB); + hc4->nextToUpdate -= distance; + hc4->base -= distance; + if ((U32)(hc4->inputBuffer - hc4->base) > 1 GB + 64 KB) /* Avoid overflow */ + { + int i; + hc4->base += 1 GB; + for (i=0; ihashTable[i] -= 1 GB; + } + hc4->end -= distance; + return (char*)(hc4->end); +} + + +FORCE_INLINE size_t LZ4HC_CommonLength (const BYTE* p1, const BYTE* p2, const BYTE* const matchlimit) +{ + const BYTE* p1t = p1; + + while (p1tchainTable; + HTYPE* const HashTable = hc4->hashTable; + const BYTE* ref; + INITBASE(base,hc4->base); + int nbAttempts=maxNbAttempts; + size_t repl=0, ml=0; + U16 delta=0; /* useless assignment, to remove an uninitialization warning */ + + /* HC4 match finder */ + LZ4HC_Insert(hc4, ip); + ref = HASH_POINTER(ip); + +#define REPEAT_OPTIMIZATION +#ifdef REPEAT_OPTIMIZATION + /* Detect repetitive sequences of length <= 4 */ + if ((U32)(ip-ref) <= 4) /* potential repetition */ + { + if (A32(ref) == A32(ip)) /* confirmed */ + { + delta = (U16)(ip-ref); + repl = ml = LZ4HC_CommonLength(ip+MINMATCH, ref+MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref = GETNEXT(ref); + } +#endif + + while (((U32)(ip-ref) <= MAX_DISTANCE) && (nbAttempts)) + { + nbAttempts--; + if (*(ref+ml) == *(ip+ml)) + if (A32(ref) == A32(ip)) + { + size_t mlt = LZ4HC_CommonLength(ip+MINMATCH, ref+MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { ml = mlt; *matchpos = ref; } + } + ref = GETNEXT(ref); + } + +#ifdef REPEAT_OPTIMIZATION + /* Complete table */ + if (repl) + { + const BYTE* ptr = ip; + const BYTE* end; + + end = ip + repl - (MINMATCH-1); + while(ptr < end-delta) + { + DELTANEXT(ptr) = delta; /* Pre-Load */ + ptr++; + } + do + { + DELTANEXT(ptr) = delta; + HashTable[HASH_VALUE(ptr)] = (HTYPE)((ptr) - base); /* Head of chain */ + ptr++; + } 
while(ptr < end); + hc4->nextToUpdate = end; + } +#endif + + return (int)ml; +} + + +FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (LZ4HC_Data_Structure* hc4, const BYTE* ip, const BYTE* startLimit, const BYTE* matchlimit, int longest, const BYTE** matchpos, const BYTE** startpos, const int maxNbAttempts) +{ + U16* const chainTable = hc4->chainTable; + HTYPE* const HashTable = hc4->hashTable; + INITBASE(base,hc4->base); + const BYTE* ref; + int nbAttempts = maxNbAttempts; + int delta = (int)(ip-startLimit); + + /* First Match */ + LZ4HC_Insert(hc4, ip); + ref = HASH_POINTER(ip); + + while (((U32)(ip-ref) <= MAX_DISTANCE) && (nbAttempts)) + { + nbAttempts--; + if (*(startLimit + longest) == *(ref - delta + longest)) + if (A32(ref) == A32(ip)) + { +#if 1 + const BYTE* reft = ref+MINMATCH; + const BYTE* ipt = ip+MINMATCH; + const BYTE* startt = ip; + + while (iptstartLimit) && (reft > hc4->inputBuffer) && (startt[-1] == reft[-1])) {startt--; reft--;} + + if ((ipt-startt) > longest) + { + longest = (int)(ipt-startt); + *matchpos = reft; + *startpos = startt; + } + } + ref = GETNEXT(ref); + } + + return longest; +} + + +typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; + +FORCE_INLINE int LZ4HC_encodeSequence ( + const BYTE** ip, + BYTE** op, + const BYTE** anchor, + int matchLength, + const BYTE* ref, + limitedOutput_directive limitedOutputBuffer, + BYTE* oend) +{ + int length; + BYTE* token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if ((limitedOutputBuffer) && ((*op + length + (2 + 1 + LASTLITERALS) + (length>>8)) > oend)) return 1; /* Check output limit */ + if (length>=(int)RUN_MASK) { int len; *token=(RUN_MASK< 254 ; len-=255) *(*op)++ = 255; *(*op)++ = (BYTE)len; } + else *token = (BYTE)(length<>8) > oend)) return 1; /* Check output limit */ + if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; } + else *token += (BYTE)(length); + + /* Prepare next loop */ + *ip += matchLength; + *anchor = *ip; + + return 0; +} + + +#define MAX_COMPRESSION_LEVEL 16 +static int LZ4HC_compress_generic ( + void* ctxvoid, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int compressionLevel, + limitedOutput_directive limit + ) +{ + LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid; + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + const int maxNbAttempts = compressionLevel > MAX_COMPRESSION_LEVEL ? 1 << MAX_COMPRESSION_LEVEL : compressionLevel ? 
1<<(compressionLevel-1) : 1<end) return 0; + ctx->end += inputSize; + + ip++; + + /* Main Loop */ + while (ip < mflimit) + { + ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref), maxNbAttempts); + if (!ml) { ip++; continue; } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; + +_Search2: + if (ip+ml < mflimit) + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2, maxNbAttempts); + else ml2 = ml; + + if (ml2 == ml) /* No better match */ + { + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + continue; + } + + if (start0 < ip) + { + if (start2 < ip + ml0) /* empirical */ + { + ip = start0; + ref = ref0; + ml = ml0; + } + } + + /* Here, start0==ip */ + if ((start2 - ip) < 3) /* First Match too small : removed */ + { + ml = ml2; + ip = start2; + ref =ref2; + goto _Search2; + } + +_Search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) + { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; + if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) + { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */ + + if (start2 + ml2 < mflimit) + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, maxNbAttempts); + else ml3 = ml2; + + if (ml3 == ml2) /* No better match : 2 sequences to encode */ + { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) ml = (int)(start2 - ip); + /* Now, encode 2 sequences */ + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + ip = start2; + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml2, ref2, limit, oend)) return 0; + continue; + } + + if (start3 < ip+ml+3) /* Not enough space for match 2 : remove it */ + { + if (start3 >= (ip+ml)) /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + { + if (start2 < ip+ml) + { + int correction = (int)(ip+ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) + { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _Search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _Search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least the first one + * ip & ref are known; Now for ml + */ + if (start2 < ip+ml) + { + if ((start2 - ip) < (int)ML_MASK) + { + int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) + { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + else + { + ml = (int)(start2 - ip); + } + } + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _Search3; + + } + + /* Encode Last Literals */ + { + int lastRun = (int)(iend - anchor); + if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* 
Check output limit */ + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun< 0 + assert library._collection.versions.count() + assert repr(library.read('symbol').data) == repr(data) + # Nothing done_APPEND_COUNT + assert len(library._collection.versions.find_one({})['parent']) + else: + run_as_main(main, '--library', 'user.library', '--host', mongo_host, '-f') + assert library._collection.count() > 0 + assert library._collection.versions.count() + # Data still available (write with prune_previous_version will do the cleanup) + assert repr(library.read('symbol').data) == repr(data) + # Snapshot cleaned up + assert not len(library._collection.versions.find_one({})['parent']) + + +@pytest.mark.parametrize(['dry_run', 'data'], [(x, y) for (x, y) in itertools.product([True, False], + [some_object, ts])]) +def test_cleanup_orphaned_snapshots_nop(mongo_host, library, data, dry_run): + """ + Check that we do / don't cleanup chunks based on the dry-run + """ + yesterday = dt.utcnow() - dtd(days=1, seconds=1) + _id = bson.ObjectId.from_datetime(yesterday) + library.write('symbol', data, prune_previous_version=False) + with patch("bson.ObjectId", return_value=_id): + library.snapshot('snap_name') + + # No cleanup on dry-run + if dry_run: + run_as_main(main, '--library', 'user.library', '--host', mongo_host) + assert library._collection.count() > 0 + assert library._collection.versions.count() + assert repr(library.read('symbol').data) == repr(data) + # Nothing done + assert len(library._collection.versions.find_one({})['parent']) + else: + run_as_main(main, '--library', 'user.library', '--host', mongo_host, '-f') + assert library._collection.count() > 0 + assert library._collection.versions.count() + # Data still available (write with prune_previous_version will do the cleanup) + assert repr(library.read('symbol').data) == repr(data) + # Nothing done + assert len(library._collection.versions.find_one({})['parent']) + + +@pytest.mark.parametrize(['dry_run', 'data'], [(x, y) for (x, y) in itertools.product([True, False], + [some_object, ts])]) +def test_dont_cleanup_recent_orphaned_snapshots(mongo_host, library, data, dry_run): + """ + Check that we do / don't cleanup chunks based on the dry-run + """ + today = dt.utcnow() - dtd(hours=12, seconds=1) + _id = bson.ObjectId.from_datetime(today) + library.write('symbol', data, prune_previous_version=False) + with patch("bson.ObjectId", return_value=_id): + library.snapshot('snap_name') + + # Remove the version document ; should cleanup + assert library._collection.snapshots.delete_many({}) + + # No cleanup on dry-run + if dry_run: + run_as_main(main, '--library', 'user.library', '--host', mongo_host) + assert library._collection.count() > 0 + assert library._collection.versions.count() + assert repr(library.read('symbol').data) == repr(data) + # Nothing done + assert len(library._collection.versions.find_one({})['parent']) + else: + run_as_main(main, '--library', 'user.library', '--host', mongo_host, '-f') + assert library._collection.count() > 0 + assert library._collection.versions.count() + # Data still available (write with prune_previous_version will do the cleanup) + assert repr(library.read('symbol').data) == repr(data) + # Snapshot cleaned up + assert len(library._collection.versions.find_one({})['parent']) diff --git a/tests/integration/scripts/test_copy_data.py b/tests/integration/scripts/test_copy_data.py new file mode 100644 index 000000000..601531e29 --- 
/dev/null +++ b/tests/integration/scripts/test_copy_data.py @@ -0,0 +1,144 @@ +from mock import patch, call +from pandas.util.testing import assert_frame_equal +import pytest + +from arctic import arctic as m +from arctic.scripts import arctic_copy_data as mcd + +from ...util import read_str_as_pandas, run_as_main + + +@pytest.fixture(scope='function', autouse=True) +def init(arctic): + arctic.initialize_library('user.library', m.VERSION_STORE, segment='month') + arctic.initialize_library('user.library2', m.VERSION_STORE, segment='month') + + +ts = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 2.0 + 2012-10-09 17:06:11.040 | 2.5 + 2012-11-08 17:06:11.040 | 3.0""") +ts1 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 4.0 + 2012-10-08 17:06:11.040 | 5.0 + 2012-10-09 17:06:11.040 | 6.5 + 2012-11-08 17:06:11.040 | 7.0""") +ts2 = read_str_as_pandas(""" times | near + 2012-10-08 17:06:11.040 | 5.0 + 2012-10-09 17:06:11.040 | 6.5""") +ts3 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 5.0 + 2012-10-09 17:06:11.040 | 6.5 + 2012-11-08 17:06:11.040 | 3.0""") + +def test_copy_data_no_force(arctic, mongo_host): + src = 'user.library' + dest = 'user.library2' + # Put ts, ts1 in library + arctic[src].write('some_ts', ts1) + arctic[src].write('some_ts1', ts1) + + # Put some other value for ts in library2 + arctic[dest].write('some_ts', ts) + + # Create the user against the current mongo database + src_host = 'arctic_' + src + '@' + mongo_host + dest_host = 'arctic_' + dest + '@' + mongo_host + with patch('arctic.scripts.arctic_copy_data.logger') as logger: + run_as_main(mcd.main, '--src', src_host, '--dest', dest_host, '--log', 'CR101', 'some_ts', 'some_ts1') + + assert_frame_equal(ts, arctic[dest].read('some_ts').data) + assert_frame_equal(ts1, arctic[dest].read('some_ts1').data) + assert logger.info.call_args_list == [call('Copying data from %s -> %s' % (src_host, dest_host)), + call('Copying: 2 symbols')] + assert logger.warn.call_args_list == [call('Symbol: some_ts already exists in %s, use --force to overwrite or --splice to join with existing data' % dest_host)] + assert arctic[dest].read_audit_log('some_ts1')[0]['message'] == 'CR101' + + +def test_copy_data_force(arctic, mongo_host): + src = 'user.library' + dest = 'user.library2' + # Put ts, ts1 in library + arctic[src].write('some_ts', ts) + arctic[src].write('some_ts1', ts1) + + # Put some other value for ts in library2 + arctic[dest].write('some_ts', ts1) + + # Create the user against the current mongo database + src_host = src + '@' + mongo_host + dest_host = dest + '@' + mongo_host + with patch('arctic.scripts.arctic_copy_data.logger') as logger: + run_as_main(mcd.main, '--src', src_host, '--dest', dest_host, '--log', 'CR101', '--force', 'some_ts', 'some_ts1') + + assert_frame_equal(ts, arctic[dest].read('some_ts').data) + assert_frame_equal(ts1, arctic[dest].read('some_ts1').data) + assert logger.info.call_args_list == [call('Copying data from %s -> %s' % (src_host, dest_host)), + call('Copying: 2 symbols')] + assert logger.warn.call_args_list == [call('Symbol: some_ts already exists in destination, OVERWRITING')] + assert arctic[dest].read_audit_log('some_ts1')[0]['message'] == 'CR101' + + +def test_copy_data_splice(arctic, mongo_host): + src = 'user.library' + dest = 'user.library2' + # Put ts, ts1 in library + arctic[src].write('some_ts', ts2) + arctic[src].write('some_ts1', ts1) + + # Put some other value for ts in 
library2 + arctic[dest].write('some_ts', ts) + + # Create the user against the current mongo database + src_host = src + '@' + mongo_host + dest_host = dest + '@' + mongo_host + with patch('arctic.scripts.arctic_copy_data.logger') as logger: + run_as_main(mcd.main, '--src', src_host, '--dest', dest_host, '--log', 'CR101', '--splice', 'some_ts', 'some_ts1') + + assert_frame_equal(ts3, arctic[dest].read('some_ts').data) + assert_frame_equal(ts1, arctic[dest].read('some_ts1').data) + assert logger.info.call_args_list == [call('Copying data from %s -> %s' % (src_host, dest_host)), + call('Copying: 2 symbols')] + assert logger.warn.call_args_list == [call('Symbol: some_ts already exists in destination, splicing in new data')] + + assert arctic[dest].read_audit_log('some_ts')[0]['message'] == 'CR101' + + +def test_copy_data_wild(arctic, mongo_host): + src = 'user.library' + dest = 'user.library2' + # Put ts, ts1 in library + arctic[src].write('some_a_ts', ts) + arctic[src].write('some_a_ts1', ts1) + arctic[src].write('some_b_ts1', ts1) + arctic[src].write('some_c_ts1', ts1) + + # Create the user against the current mongo database + src_host = 'arctic_' + src + '@' + mongo_host + dest_host = 'arctic_' + dest + '@' + mongo_host + with patch('arctic.scripts.arctic_copy_data.logger') as logger: + run_as_main(mcd.main, '--src', src_host, '--dest', dest_host, '--log', 'CR101', '.*_a_.*', '.*_b_.*') + + assert_frame_equal(ts, arctic[dest].read('some_a_ts').data) + assert_frame_equal(ts1, arctic[dest].read('some_a_ts1').data) + assert_frame_equal(ts1, arctic[dest].read('some_b_ts1').data) + assert logger.info.call_args_list == [call('Copying data from %s -> %s' % (src_host, dest_host)), + call('Copying: 3 symbols')] + assert arctic[dest].read_audit_log('some_a_ts1')[0]['message'] == 'CR101' + + +def test_copy_data_doesnt_exist(arctic, mongo_host): + src = 'user.library' + dest = 'user.library2' + + # Create the user against the current mongo database + src_host = src + '@' + mongo_host + dest_host = dest + '@' + mongo_host + with patch('arctic.scripts.arctic_copy_data.logger') as logger: + run_as_main(mcd.main, '--src', src_host, '--dest', dest_host, '--log', 'CR101', 'some_ts') + + assert logger.info.call_args_list == [call('Copying data from %s -> %s' % (src_host, dest_host)), + call('Copying: 0 symbols')] + assert logger.warn.call_args_list == [call('No symbols found that matched those provided.')] diff --git a/tests/integration/scripts/test_create_user.py b/tests/integration/scripts/test_create_user.py new file mode 100644 index 000000000..b70f549cd --- /dev/null +++ b/tests/integration/scripts/test_create_user.py @@ -0,0 +1,108 @@ +from mock import patch +from StringIO import StringIO + +from arctic.auth import Credential +from arctic.scripts import arctic_create_user as mcu + +from ...util import run_as_main + + +def test_create_user(mongo_host, mongodb): + # Create the user agains the current mongo database + with patch('arctic.scripts.arctic_create_user.get_auth', + return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate',return_value=True): + run_as_main(mcu.main, '--host', mongo_host, 'user', '--pass', 'pass') + + # Check: + # User exists in system + user = mongodb.admin.system.users.find_one({'user': 'user'}) + assert user + assert user['readOnly'] == True + # User db exists + user = mongodb.arctic_user.system.users.find_one({'user': 'user'}) + assert user + assert 'readOnly' not in user or user['readOnly'] == False + + +def 
test_create_admin_user(mongo_host, mongodb): + # Create the user agains the current mongo database + + with patch('arctic.scripts.arctic_create_user.get_auth', + return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate', return_value=True): + run_as_main(mcu.main, '--host', mongo_host, 'user', '--pass', 'pass', '--admin-write') + + # Check: + # User exists in system + user = mongodb.admin.system.users.find_one({'user': 'user'}) + assert user + assert 'readOnly' not in user or user['readOnly'] == False + # User db exists + user = mongodb.arctic_user.system.users.find_one({'user': 'user'}) + assert user + assert 'readOnly' not in user or user['readOnly'] == False + + +def test_create_user_verbose(mongo_host, mongodb): + user = 'user' + pwd = 'password' + stderr = StringIO() + stdout = StringIO() + with patch('arctic.scripts.arctic_create_user.get_auth', + return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate', return_value=True), \ + patch('sys.stderr', stderr), \ + patch('sys.stdout', stdout): + run_as_main(mcu.main, '--host', mongo_host, user, '--pass', pwd, '--verbose') + out = stdout.getvalue() + assert 'Adding user %s to DB %s' % (user, mongo_host) in out + assert 'Adding database arctic_%s to DB %s' % (user, mongo_host) in out + + +def test_create_user_dryrun_nodb(mongo_host, mongodb): + user = 'user' + pwd = 'password' + stderr = StringIO() + stdout = StringIO() + with patch('arctic.scripts.arctic_create_user.get_auth', + return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate', return_value=True), \ + patch('sys.stderr', stderr), \ + patch('sys.stdout', stdout): + run_as_main(mcu.main, '--host', mongo_host, user, '--pass', pwd, '--dryrun', '--nodb') + out = stdout.getvalue() + assert 'DRYRUN: add user %s readonly True nodb True' % (user) in out + +def test_create_user_no_passwd(mongo_host, mongodb): + user = 'user' + pwd = None + newpwd = 'newpasswd' + stdout = StringIO() + with patch('arctic.scripts.arctic_create_user.get_auth', + return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate',return_value=True), \ + patch('base64.b64encode',return_value=newpwd), \ + patch('sys.stdout', stdout): + run_as_main(mcu.main, '--host', mongo_host, user) + out = stdout.getvalue() + assert '%-16s %s' % (user,newpwd) in out + + +def test_create_user_no_creds(mongo_host, mongodb): + stderr = StringIO() + with patch('arctic.scripts.arctic_create_user.get_auth', return_value=None), \ + patch('sys.stderr', stderr): + run_as_main(mcu.main, '--host', mongo_host) + err = stderr.getvalue() + assert 'You have no admin credentials' in err + + +def test_create_user_auth_fail(mongo_host): + stderr = StringIO() + with patch('arctic.scripts.arctic_create_user.get_auth', return_value=Credential('admin', 'user', 'pass')), \ + patch('pymongo.database.Database.authenticate', return_value=False), \ + patch('sys.stderr', stderr): + run_as_main(mcu.main, '--host', mongo_host) + err = stderr.getvalue() + assert 'Failed to authenticate' in err diff --git a/tests/integration/scripts/test_delete_library.py b/tests/integration/scripts/test_delete_library.py new file mode 100644 index 000000000..0ce944ffb --- /dev/null +++ b/tests/integration/scripts/test_delete_library.py @@ -0,0 +1,61 @@ +import getpass +import pytest + +from arctic.scripts import arctic_delete_library + +from ...util import run_as_main + + 
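+# Two library names are exercised below: the plain 'user.library' and one
+# namespaced under the current OS user, so each test can check that deleting
+# one library leaves the other untouched.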
+@pytest.fixture(scope='function') +def library_name(): + return 'user.library' + + +@pytest.fixture(scope='function') +def user_library_name(): + return "{}.library".format(getpass.getuser()) + + +def test_delete_library(mongo_host, arctic, library, user_library): + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + run_as_main(arctic_delete_library.main, '--host', mongo_host, + '--library', 'user.library') + assert 'user.library' not in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + + +def test_delete_library1(mongo_host, arctic, library, user_library): + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + run_as_main(arctic_delete_library.main, '--host', mongo_host, + '--library', 'arctic_user.library') + assert 'user.library' not in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + + +def test_delete_library2(mongo_host, arctic, library, user_library): + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + run_as_main(arctic_delete_library.main, '--host', mongo_host, + '--library', 'arctic_%s.library' % getpass.getuser()) + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() not in arctic.list_libraries() + + +def test_delete_library3(mongo_host, arctic, library, user_library): + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + run_as_main(arctic_delete_library.main, '--host', mongo_host, + '--library', '%s.library' % getpass.getuser()) + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() not in arctic.list_libraries() + + +def test_delete_library_doesnt_exist(mongo_host, arctic, library, user_library): + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() + run_as_main(arctic_delete_library.main, '--host', mongo_host, + '--library', 'arctic_nosuchlibrary.missing') + assert 'user.library' in arctic.list_libraries() + assert '%s.library' % getpass.getuser() in arctic.list_libraries() diff --git a/tests/integration/scripts/test_enable_sharding.py b/tests/integration/scripts/test_enable_sharding.py new file mode 100644 index 000000000..ec59f3a3f --- /dev/null +++ b/tests/integration/scripts/test_enable_sharding.py @@ -0,0 +1,44 @@ +from mock import patch, Mock, call +import getpass +import pytest +from pymongo.errors import OperationFailure +from pymongo.read_preferences import Primary + +from arctic.hooks import get_mongodb_uri +from arctic.scripts import arctic_enable_sharding as mes + +from ...util import run_as_main + + +def test_enable_sharding(mongo_host, arctic, mongodb, user_library, user_library_name): + c = mongodb + with patch.object(c, 'admin') as admin: + with patch('pymongo.MongoClient', return_value=c) as mc: + run_as_main(mes.main, '--host', mongo_host, '--library', user_library_name) + assert mc.call_args_list == [call(get_mongodb_uri(mongo_host))] + assert admin.command.call_args_list == [call('buildinfo', read_preference=Primary()), + call('enablesharding', 'arctic_' + getpass.getuser()), + call('shardCollection', 'arctic_' + user_library_name, key={'symbol': 1})] + + +def test_enable_sharding_already_on_db(mongo_host, arctic, mongodb, user_library, 
user_library_name): + c = mongodb + with patch.object(c, 'admin') as admin: + admin.command = Mock(return_value=[OperationFailure("failed: already enabled"), + None]) + with patch('pymongo.MongoClient', return_value=c) as mc: + run_as_main(mes.main, '--host', mongo_host, '--library', user_library_name) + assert mc.call_args_list == [call(get_mongodb_uri(mongo_host))] + assert admin.command.call_args_list == [call('buildinfo', read_preference=Primary()), + call('enablesharding', 'arctic_' + getpass.getuser()), + call('shardCollection', 'arctic_' + user_library_name, key={'symbol': 1})] + + +def test_enable_sharding_on_db_other_failure(mongo_host, arctic, mongodb, user_library, user_library_name): + # Create the user agains the current mongo database + c = mongodb + with pytest.raises(OperationFailure): + with patch.object(c, 'admin') as admin: + with patch('pymongo.MongoClient', return_value=c): + admin.command = Mock(side_effect=OperationFailure('OOPS')) + run_as_main(mes.main, '--host', mongo_host, '--library', user_library_name) diff --git a/tests/integration/scripts/test_initialize_library.py b/tests/integration/scripts/test_initialize_library.py new file mode 100644 index 000000000..57ec93419 --- /dev/null +++ b/tests/integration/scripts/test_initialize_library.py @@ -0,0 +1,41 @@ +from mock import patch +import pytest + +from arctic.auth import Credential +from arctic.arctic import Arctic +from arctic.scripts import arctic_init_library as mil + +from ...util import run_as_main + + +def test_init_library(mongo_host): + # Create the user agains the current mongo database + with patch('arctic.scripts.arctic_init_library.do_db_auth', return_value=True), \ + patch('pymongo.database.Database.authenticate', return_value=True): + run_as_main(mil.main, '--host', mongo_host, '--library', 'arctic_user.library') + + # Should be able to write something to the library now + store = Arctic(mongo_host) + assert store['user.library']._arctic_lib.get_library_metadata('QUOTA') == 10240 * 1024 * 1024 + store['user.library'].write('key', {'a': 'b'}) + assert store['user.library'].read('key').data == {'a': 'b'} + + +def test_init_library_quota(mongo_host): + # Create the user agains the current mongo database + with patch('arctic.scripts.arctic_init_library.do_db_auth', return_value=True), \ + patch('pymongo.database.Database.authenticate', return_value=True): + run_as_main(mil.main, '--host', mongo_host, '--library', 'arctic_user.library', '--quota', '100') + + # Should be able to write something to the library now + store = Arctic(mongo_host) + assert store['user.library']._arctic_lib.get_library_metadata('QUOTA') == 100 * 1024 * 1024 * 1024 + + +def test_init_library_bad_library(mongo_host): + with pytest.raises(Exception): + with patch('arctic.arctic.get_auth', return_value=Credential('admin', 'adminuser', 'adminpwd', 'admin')), \ + patch('pymongo.database.Database.authenticate', return_value=True), \ + patch('argparse.ArgumentParser.error', side_effect=Exception): + # Create the user agains the current mongo database + run_as_main(mil.main, '--host', mongo_host, '--library', 'user') diff --git a/tests/integration/scripts/test_list_libraries.py b/tests/integration/scripts/test_list_libraries.py new file mode 100644 index 000000000..113f73a2e --- /dev/null +++ b/tests/integration/scripts/test_list_libraries.py @@ -0,0 +1,28 @@ +from mock import patch, call +import pytest + +from arctic.scripts import arctic_list_libraries + +from ...util import run_as_main + + +def test_list_library(mongo_host, library, 
library_name): + with patch('arctic.scripts.arctic_list_libraries.print') as p: + run_as_main(arctic_list_libraries.main, "--host", mongo_host) + for x in p.call_args_list: + if x == call(library_name): + return + assert False, "Failed to find a library" + + +def test_list_library_args(mongo_host, library, library_name): + with patch('arctic.scripts.arctic_list_libraries.print') as p: + run_as_main(arctic_list_libraries.main, "--host", mongo_host, library_name[:2]) + for x in p.call_args_list: + assert x[0][0].startswith(library_name[:2]) + + +def test_list_library_args_not_found(mongo_host, library, library_name): + with patch('arctic.scripts.arctic_list_libraries.print') as p: + run_as_main(arctic_list_libraries.main, "--host", mongo_host, 'some_library_which_doesnt_exist') + assert p.call_count == 0 diff --git a/tests/integration/scripts/test_prune_versions.py b/tests/integration/scripts/test_prune_versions.py new file mode 100644 index 000000000..8acd98ae0 --- /dev/null +++ b/tests/integration/scripts/test_prune_versions.py @@ -0,0 +1,39 @@ +from mock import patch, ANY, call + +from arctic.auth import Credential +from arctic.scripts import arctic_prune_versions as mpv + +from ...util import run_as_main + + +def test_prune_versions_symbol(mongo_host, library, library_name): + with patch('arctic.scripts.arctic_prune_versions.prune_versions', autospec=True) as prune_versions, \ + patch('arctic.scripts.utils.get_auth', return_value=Credential('admin', 'adminuser', 'adminpwd')), \ + patch('pymongo.database.Database.authenticate', return_value=True): + + run_as_main(mpv.main, '--host', mongo_host, '--library', library_name, '--symbols', 'sym1,sym2') + prune_versions.assert_has_calls([call(ANY, 'sym1', 10), + call(ANY, 'sym2', 10), ]) + + +def test_prune_versions_full(mongo_host, library, library_name): + with patch('arctic.scripts.arctic_prune_versions.do_db_auth', return_value=True): + # Write some stuff with snapshots + library.snapshot('snap') + library.write('symbol', "val1") + library.write('symbol', "val2") + library.snapshot('snap1') + library.write('symbol', "val3") + + # Prune older than 10 mins - nothing deleted + run_as_main(mpv.main, '--host', mongo_host, '--library', library_name, '--keep-mins', 10) + assert [x['version'] for x in library.list_versions('symbol')] == [3, 2, 1] + # Prune older than 0 minutes, v1 deleted + run_as_main(mpv.main, '--host', mongo_host, '--library', library_name, '--keep-mins', 0) + assert [x['version'] for x in library.list_versions('symbol')] == [3, 2] + + # Delete the snapshots + library.delete_snapshot('snap') + library.delete_snapshot('snap1') + run_as_main(mpv.main, '--host', mongo_host, '--library', library_name, '--keep-mins', 0) + assert [x['version'] for x in library.list_versions('symbol')] == [3] diff --git a/tests/integration/store/__init__.py b/tests/integration/store/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/store/test_ndarray_store.py b/tests/integration/store/test_ndarray_store.py new file mode 100644 index 000000000..ff061005f --- /dev/null +++ b/tests/integration/store/test_ndarray_store.py @@ -0,0 +1,380 @@ +import bson +from bson import ObjectId, Binary +import datetime +from datetime import datetime as dt, timedelta as dtd +from mock import patch +import numpy as np +from numpy.testing import assert_equal +import os +from pymongo.server_type import SERVER_TYPE +import pytest +import time + +from arctic.store._ndarray_store import NdarrayStore, _APPEND_COUNT +from 
arctic.store.version_store import register_versioned_storage + +from tests.integration.store.test_version_store import _query + + +register_versioned_storage(NdarrayStore) + + +def test_save_read_simple_ndarray(library): + ndarr = np.ones(1000) + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + +def test_read_simple_ndarray_from_secondary(library_secondary, library_name): + ndarr = np.ones(1000) + library_secondary.write('MYARR', ndarr) + with patch('pymongo.message.query', side_effect=_query(True, library_name)) as query, \ + patch('pymongo.server_description.ServerDescription.server_type', SERVER_TYPE.Mongos): + saved_arr = library_secondary.read('MYARR').data + assert query.call_count > 0 + assert np.all(ndarr == saved_arr) + + +def test_save_read_big_1darray(library): + ndarr = np.random.rand(5326, 6020).ravel() + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + +def test_save_and_resave_reuses_chunks(library): + with patch('arctic.store._ndarray_store._CHUNK_SIZE', 1000): + ndarr = np.random.rand(1024) + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + orig_chunks = library._collection.count() + assert orig_chunks == 9 + + # Concatenate more values + ndarr = np.concatenate([ndarr, np.random.rand(10)]) + # And change the original values - we're not a simple append + ndarr[0] = ndarr[1] = ndarr[2] = 0 + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + # Should contain the original chunks, but not double the number + # of chunks + new_chunks = library._collection.count() + assert new_chunks == 11 + + # We hit the update (rather than upsert) code path + assert library._collection.find({'parent': {'$size': 2}}).count() == 7 + + +def test_save_read_big_2darray(library): + ndarr = np.random.rand(5326, 6020) + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + +def test_get_info_bson_object(library): + ndarr = np.ones(1000) + library.write('MYARR', ndarr) + assert library._get_info('MYARR').startswith('''Handler: NdarrayStore''') + + +def test_save_read_ndarray_with_array_field(library): + ndarr = np.empty(10, dtype=[('A', 'int64'), ('B', 'float64', (2,))]) + ndarr['A'] = 1 + ndarr['B'] = 2 + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + +def test_append_ndarray_with_field_shape(library): + ndarr = np.empty(10, dtype=[('A', 'int64'), ('B', 'float64', (2,))]) + ndarr['A'] = 1 + ndarr['B'] = 2 + ndarr2 = np.empty(10, dtype=[('A', 'int64'), ('B', 'int64', (2,))]) + ndarr2['A'] = 1 + ndarr2['B'] = 2 + + library.write('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + ndarr3 = np.empty(20, dtype=[('A', 'int64'), ('B', 'float64', (2,))]) + ndarr3['A'] = 1 + ndarr3['B'] = 2 + assert np.all(ndarr3 == saved_arr) + + +def test_append_simple_ndarray(library): + ndarr = np.ones(1000, dtype='int64') + library.write('MYARR', ndarr) + library.append('MYARR', np.ones(1000, dtype='int64')) + library.append('MYARR', np.ones(1000, dtype='int64')) + library.append('MYARR', np.ones(2005, dtype='int64')) + saved_arr = library.read('MYARR').data + assert np.all(np.ones(5005, dtype='int64') == saved_arr) + + +def test_append_simple_ndarray_promoting_types(library): + ndarr = np.ones(100, dtype='int64') + 
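# Write int64 data first, then append alternating float64/int64 chunks; the saved dtype should be promoted so the final read returns float64 +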
library.write('MYARR', ndarr) + library.append('MYARR', np.ones(100, dtype='float64')) + library.append('MYARR', np.ones(100, dtype='int64')) + library.append('MYARR', np.ones(205, dtype='float64')) + saved_arr = library.read('MYARR').data + assert np.all(np.ones(505, dtype='float64') == saved_arr) + + +def test_save_read_ndarray(library): + ndarr = np.empty(1000, dtype=[('abc', 'int64')]) + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + + +def test_multiple_write(library): + ndarr = np.empty(1000, dtype=[('abc', 'int64')]) + foo = np.empty(900, dtype=[('abc', 'int64')]) + library.write('MYARR', foo) + v1 = library.read('MYARR').version + library.write('MYARR', ndarr[:900]) + v2 = library.read('MYARR').version + library.append('MYARR', ndarr[-100:]) + v3 = library.read('MYARR').version + + assert np.all(ndarr == library.read('MYARR').data) + assert np.all(ndarr == library.read('MYARR', as_of=v3).data) + assert np.all(foo == library.read('MYARR', as_of=v1).data) + assert np.all(ndarr[:900] == library.read('MYARR', as_of=v2).data) + + +def test_cant_write_objects(): + store = NdarrayStore() + assert not store.can_write(None, None, np.array([object()])) + + +def test_promote_types(library): + ndarr = np.empty(1000, dtype=[('abc', 'int64')]) + library.write('MYARR', ndarr[:800]) + library.append('MYARR', ndarr[-200:].astype([('abc', 'float64')])) + saved_arr = library.read('MYARR').data + assert np.all(ndarr.astype([('abc', 'float64')]) == saved_arr) + + +def test_promote_types2(library): + ndarr = np.array(np.arange(1000), dtype=[('abc', 'float64')]) + library.write('MYARR', ndarr[:800]) + library.append('MYARR', ndarr[-200:].astype([('abc', 'int64')])) + saved_arr = library.read('MYARR').data + assert np.all(ndarr.astype([('abc', np.promote_types('float64', 'int64'))]) == saved_arr) + + +def test_save_read_large_ndarray(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(30 * 1024 * 1024 / dtype.itemsize).view(dtype=dtype) + assert len(ndarr.tostring()) > 16 * 1024 * 1024 + library.write('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(ndarr == saved_arr) + +def test_append_read_large_ndarray(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(50 * 1024 * 1024 / dtype.itemsize).view(dtype=dtype) + assert len(ndarr.tostring()) > 16 * 1024 * 1024 + library.write('MYARR1', ndarr) + # Exactly enough appends to trigger 2 re-compacts, so the result should be identical + # to writing the whole array at once + ndarr2 = np.arange(240).view(dtype=dtype) + for n in np.split(ndarr2, 120): + library.append('MYARR1', n) + + saved_arr = library.read('MYARR1').data + assert np.all(np.concatenate([ndarr, ndarr2]) == saved_arr) + + library.write('MYARR2', np.concatenate([ndarr, ndarr2])) + + version1 = library._read_metadata('MYARR1') + version2 = library._read_metadata('MYARR2') + assert version1['append_count'] == version2['append_count'] + assert version1['append_size'] == version2['append_size'] + assert version1['segment_count'] == version2['segment_count'] + assert version1['up_to'] == version2['up_to'] + + +def test_save_append_read_ndarray(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(30 * 1024 * 1024 / dtype.itemsize).view(dtype=dtype) + assert len(ndarr.tostring()) > 16 * 1024 * 1024 + library.write('MYARR', ndarr) + + sliver = np.arange(30).view(dtype=dtype) + library.append('MYARR', sliver) + + saved_arr = library.read('MYARR').data + assert 
np.all(np.concatenate([ndarr, sliver]) == saved_arr) + + library.append('MYARR', sliver) + saved_arr = library.read('MYARR').data + assert np.all(np.concatenate([ndarr, sliver, sliver]) == saved_arr) + + +def test_save_append_read_1row_ndarray(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(30 * 1024 * 1024 / dtype.itemsize).view(dtype=dtype) + assert len(ndarr.tostring()) > 16 * 1024 * 1024 + library.write('MYARR', ndarr) + + sliver = np.arange(1).view(dtype=dtype) + library.append('MYARR', sliver) + + saved_arr = library.read('MYARR').data + assert np.all(np.concatenate([ndarr, sliver]) == saved_arr) + + library.append('MYARR', sliver) + saved_arr = library.read('MYARR').data + assert np.all(np.concatenate([ndarr, sliver, sliver]) == saved_arr) + + +def test_append_too_large_ndarray(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(30 * 1024 * 1024 / dtype.itemsize).view(dtype=dtype) + assert len(ndarr.tostring()) > 16 * 1024 * 1024 + library.write('MYARR', ndarr) + library.append('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(np.concatenate([ndarr, ndarr]) == saved_arr) + + +def test_empty_append_promotes_dtype(library): + ndarr = np.array(["a", "b", "c"]) + ndarr2 = np.array([]) + library.write('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == ndarr) + + +def test_empty_append_promotes_dtype2(library): + ndarr = np.array([]) + ndarr2 = np.array(["a", "b", "c"]) + library.write('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == ndarr2) + + +def test_empty_append_promotes_dtype3(library): + ndarr = np.array([]) + ndarr2 = np.array(["a", "b", "c"]) + library.write('MYARR', ndarr) + library.append('MYARR', ndarr2) + library.append('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == np.hstack((ndarr2, ndarr2))) + + +def test_empty_append_concat_and_rewrite(library): + ndarr = np.array([]) + ndarr2 = np.array(["a", "b", "c"]) + library.write('MYARR', ndarr) + for _ in range(_APPEND_COUNT + 2): + library.append('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == ndarr2) + + +def test_empty_append_concat_and_rewrite_2(library): + ndarr2 = np.array(["a", "b", "c"]) + library.write('MYARR', ndarr2) + for _ in range(_APPEND_COUNT + 1): + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == np.hstack([ndarr2] * (_APPEND_COUNT + 2))) + + +def test_empty_append_concat_and_rewrite_3(library): + ndarr = np.array([]) + ndarr2 = np.array(["a", "b", "c"]) + library.write('MYARR', ndarr2) + for _ in range(_APPEND_COUNT + 1): + library.append('MYARR', ndarr) + saved_arr = library.read('MYARR').data + assert np.all(saved_arr == ndarr2) + + +def test_append_with_extra_columns(library): + ndarr = np.array([(2.1, 1, "a")], dtype=[('C', np.float), ('B', np.int), ('A', 'S1')]) + ndarr2 = np.array([("b", 2, 3.1, 'c', 4, 5.)], dtype=[('A', 'S1'), ('B', np.int), ('C', np.float), + ('D', 'S1'), ('E', np.int), ('F', np.float)]) + expected = np.array([("a", 1, 2.1, '', 0, np.nan), + ("b", 2, 3.1, 'c', 4, 5.)], + dtype=np.dtype([('A', 'S1'), ('B', np.int), ('C', np.float), + ('D', 'S1'), ('E', np.int), ('F', np.float)])) + library.write('MYARR', ndarr) + library.append('MYARR', ndarr2) + saved_arr = library.read('MYARR').data + + assert 
expected.dtype == saved_arr.dtype + assert_equal(expected.tolist(), saved_arr.tolist()) + + +def test_logging_of_bad_documents(library): + ndarr = np.array([(2.1, 1, "a")], dtype=[('C', np.float), ('B', np.int), ('A', 'S1')]) + library.write('MYARR', ndarr) + + doc = library._collection.find_one() + with patch('arctic.store._ndarray_store.decompress', side_effect=Exception("some-error")), \ + patch('arctic.decorators.datetime') as dt, \ + pytest.raises(Exception) as e: + dt.now.return_value = datetime.datetime(1970, 1, 1) + library.read('MYARR') + assert 'some-error' in str(e) + path = '/tmp/mongo_debug_' + str(os.getpid()) + '_' + str(doc['_id']) + '_1970-01-01 00:00:00' + with open(path, 'r') as f: + for l in f: + assert l.strip() == str(doc) + new_doc = eval(l.strip()) + assert doc['data'] == new_doc['data'] + + os.remove(path) + + +def test_save_append_delete_append(library): + dtype = np.dtype([('abc', 'int64')]) + ndarr = np.arange(30 / dtype.itemsize).view(dtype=dtype) + v1 = library.write('MYARR', ndarr) + + sliver = np.arange(30).view(dtype=dtype) + v2 = library.append('MYARR', sliver) + + # intentionally leave an orphaned chunk lying around here + library._delete_version('MYARR', v2.version, do_cleanup=False) + + sliver2 = np.arange(start=10, stop=40).view(dtype=dtype) + # we can't append here, as the latest version is now out of sync with version_nums. + # This gets translated to a do_append by the handler anyway. + v3 = library.write('MYARR', np.concatenate([ndarr, sliver2])) + + assert np.all(ndarr == library.read('MYARR', as_of=v1.version).data) + + # Check that we don't get the orphaned chunk from v2 back again. + assert np.all(np.concatenate([ndarr, sliver2]) == library.read('MYARR', as_of=v3.version).data) + + +@pytest.mark.xfail(reason="delete_version not safe with append...") +def test_delete_version_shouldnt_break_read(library): + data = np.arange(30) + yesterday = dt.utcnow() - dtd(days=1, seconds=1) + _id = bson.ObjectId.from_datetime(yesterday) + with patch("bson.ObjectId", return_value=_id): + library.write('symbol', data, prune_previous_version=False) + + # Re-Write the data again + library.write('symbol', data, prune_previous_version=False) + library._delete_version('symbol', 1) + assert repr(library.read('symbol').data) == repr(data) diff --git a/tests/integration/store/test_pandas_store.py b/tests/integration/store/test_pandas_store.py new file mode 100644 index 000000000..d1d3b4197 --- /dev/null +++ b/tests/integration/store/test_pandas_store.py @@ -0,0 +1,601 @@ +from StringIO import StringIO +from datetime import datetime as dt, timedelta as dtd +from dateutil.rrule import rrule, DAILY +from pandas import DataFrame, Series, DatetimeIndex, MultiIndex, read_csv, Panel +from pandas.util.testing import assert_frame_equal +import numpy as np +import pytest +import io +import itertools +from mock import Mock +import string + +from arctic.store._pandas_ndarray_store import PandasDataFrameStore, PandasSeriesStore +from arctic.store.version_store import register_versioned_storage + +register_versioned_storage(PandasDataFrameStore) + + +def test_save_read_pandas_series(library): + s = Series(data=[1, 2, 3], index=[4, 5, 6]) + library.write('pandas', s) + saved = library.read('pandas').data + assert np.all(s == saved) + assert saved.name == "values" + + +def test_save_read_pandas_series_maintains_name(library): + s = Series(data=[1, 2, 3], index=[4, 5, 6], name="ADJ") + library.write('pandas', s) + saved = library.read('pandas').data + assert np.all(s == saved) + assert 
saved.name == "ADJ" + + +def test_save_read_pandas_series_with_multiindex(library): + df = Series(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_series_with_multiindex_and_name(library): + df = Series(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]), + name='Foo') + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + assert df.name == 'Foo' + + +def test_save_read_pandas_series_with_unicode_index_name(library): + df = Series(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_multiindex(library): + df = DataFrame(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_none_values(library): + df = DataFrame(data=[(1, None), (1, 3), (2, 2)]) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all((df.values == saved_df.values) | (np.isnan(df.values) & np.isnan(saved_df.values))) + + +def test_save_read_pandas_dataframe_with_unicode_index_name(library): + df = DataFrame(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + + +def test_cant_write_pandas_series_with_tuple_values(library): + df = Series(data=[('A', 'BC')], index=np.array([dt(2013, 1, 1), ]).astype('datetime64[ns]')) + assert PandasSeriesStore().can_write(Mock(), 'FOO', df) == False + + +def test_save_read_pandas_series_with_datetimeindex_with_timezone(library): + df = Series(data=['A', 'BC', 'DEF'], index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), + tz="America/Chicago")) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert df.index.tz == saved_df.index.tz + assert all(df.index == saved_df.index) + + +def test_save_read_pandas_series_with_datetimeindex(library): + df = Series(data=['A', 'BC', 'DEF'], index=np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]')) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.index == saved_df.index) + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_datetimeindex_with_timezone(library): + df = DataFrame(data=['A', 'BC', 'DEF'], index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), + tz="America/Chicago")) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert df.index.tz == saved_df.index.tz + assert all(df.index == saved_df.index) + + +def test_save_read_pandas_dataframe_with_datetimeindex(library): + df = DataFrame(data=['A', 'BC', 'DEF'], index=np.array([dt(2013, 1, 1), + dt(2013, 1, 
2), + dt(2013, 1, 3)]).astype('datetime64[ns]')) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.index == saved_df.index) + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_strings(library): + df = DataFrame(data=['A', 'BC', 'DEF'], index=[4, 5, 6]) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe(library): + df = DataFrame(data=[1, 2, 3], index=[4, 5, 6]) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_empty_dataframe(library): + df = DataFrame({'a': [], 'b': []}) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe2(library): + df = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_strings(library): + df = DataFrame(data=['a', 'b', 'c'], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_empty_multiindex(library): + expected = read_csv(io.BytesIO('''\ +STRATEGY MAC INSTRUMENT CONTRACT $Price $Delta $Gamma $Vega $Theta $Notional uDelta uGamma uVega uTheta Delta Gamma Vega Theta'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + library.write('pandas', expected) + saved_df = library.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def test_save_read_pandas_dataframe_empty_multiindex_and_no_columns(library): + expected = read_csv(io.BytesIO('''STRATEGY MAC INSTRUMENT CONTRACT'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + library.write('pandas', expected) + saved_df = library.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def test_save_read_pandas_dataframe_multiindex_and_no_columns(library): + expected = read_csv(io.BytesIO('''\ +STRATEGY MAC INSTRUMENT CONTRACT +STRAT F22 ASD 201312'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + library.write('pandas', expected) + saved_df = library.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def test_append_pandas_dataframe(library): + df = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + df2 = DataFrame(data=[4, 5, 6], index=DatetimeIndex(start='2/1/2011', periods=3, freq='H')) + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def test_empty_dataframe_multindex(library): + df = DataFrame({'a': [], 'b': [], 'c': []}) + df = df.groupby(['a', 'b']).sum() + print df + library.write('pandas', df) + saved_df = library.read('pandas').data + assert np.all(df.values == saved_df.values) + assert np.all(df.index.names == df.index.names) + + +def test_dataframe_append_empty(library): + df = DataFrame(data=[1, 
2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + df2 = DataFrame(data=[], index=[]) + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def test_empy_dataframe_append(library): + df = DataFrame(data=[], index=[]) + df2 = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def test_dataframe_append_empty_multiindex(library): + df = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + assert np.all(df.index.names == saved_df.index.names) + + +def test_empty_dataframe_append_multiindex(library): + df = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a', 'b']).sum() + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + assert np.all(df.index.names == saved_df.index.names) + + +def test_empty_dataframe_should_ignore_dtype(library): + df = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a']).sum() + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df2.index.names == saved_df.index.names) + + +def test_empty_dataframe_should_ignore_dtype2(library): + df = DataFrame({'a': []}) + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a']).sum() + library.write('pandas', df) + library.append('pandas', df2) + saved_df = library.read('pandas').data + assert np.all(df2.values == saved_df.values) + assert np.all(df2.index.names == saved_df.index.names) + + +def test_dataframe_append_should_promote_string_column(library): + data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), ], names=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a30')]) + data2[:] = [(3, 4., 'Hello World - Good Morning')] + df2 = DataFrame(data2, index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a30')]) + expected_data[:] = [(1, 2., 'Hello'), (2, 3., "World"), (3, 4., 'Hello World - Good Morning')] + expected = DataFrame(expected_data, MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], + names=[u'DATETIME'])) + + library.write('pandas', df) + library.append('pandas', df2) + actual = library.read('pandas').data + + assert_frame_equal(expected, actual) + + +def test_dataframe_append_should_add_new_column(library): + data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, 
index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), ], names=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10'), ('D', 'f4')]) + data2[:] = [(4, 5., 'Hi', 6.)] + df2 = DataFrame(data2, index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10'), ('D', 'f4')]) + expected_data[:] = [(1, 2., 'Hello', np.nan), (2, 3., "World", np.nan), (4, 5., 'Hi', 6.)] + expected = DataFrame(expected_data, MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], + names=[u'DATETIME'])) + + library.write('pandas', df) + library.append('pandas', df2) + actual = library.read('pandas').data + + assert_frame_equal(expected, actual) + + +def test_dataframe_append_should_add_new_columns_and_reorder(library): + data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), ], names=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('C', 'a10'), ('A', 'i4'), ('E', 'a1'), ('B', 'f4'), ('D', 'f4'), ('F', 'i4')]) + data2[:] = [('Hi', 4, 'Y', 5., 6., 7)] + df2 = DataFrame(data2, index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('C', 'a10'), ('A', 'i4'), ('E', 'a1'), + ('B', 'f4'), ('D', 'f4'), ('F', 'i4')]) + expected_data[:] = [('Hello', 1, '', 2., np.nan, 0), ("World", 2, '', 3., np.nan, 0), ('Hi', 4, 'Y', 5., 6., 7)] + expected = DataFrame(expected_data, MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], + names=[u'DATETIME'])) + + library.write('pandas', df) + library.append('pandas', df2) + actual = library.read('pandas').data + + assert_frame_equal(expected, actual) + + +# -- auto generated tests --- # +def dataframe(columns, length, index): + df = DataFrame(np.ones((length, columns)), columns=list(string.ascii_lowercase[:columns])) + index = min(index, columns) + if index: + df = df.set_index(list(string.ascii_lowercase[:index])) + return df + + + +@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([0, 1, 2, 4], r=3))) +def test_dataframe_save_read(library, df_size): + df = dataframe(*df_size) + library.write('pandas', df) + result = library.read('pandas').data + assert np.all(df.values == result.values), str(df.values) + "!=" + str(result.values) + if None not in df.index.names: # saved as 'index' or 'level' + assert np.all(df.index.names == result.index.names), str(df.index.names) + "!=" + str(result.index.names) + assert np.all(df.index.values == result.index.values), str(df.index.values) + "!=" + str(result.index.values) + assert np.all(df.columns.values == result.columns.values), str(df.columns.values) + "!=" + str(result.columns.values) + + +@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([0, 1, 2, 4], r=3))) +def test_dataframe_save_append_read(library, df_size): + df = dataframe(*df_size) + library.write('pandas', df) + library.append('pandas', df) + result = library.read('pandas').data + assert len(result) == len(df) * 2 + if None not in df.index.names: # saved as 'index' or 'level' + assert np.all(df.index.names == result.index.names), str(df.index.names) 
+ "!=" + str(result.index.names) + assert np.all(df.columns.values == result.columns.values), str(df.columns.values) + "!=" + str(result.columns.values) + + + +def test_large_dataframe_append_rewrite_same_item(library): + csv = \ +"""index, f1, f2, f3, f4, f5, f6, f7, f8, iVol, tau, uPrice, uDelta, uGamma, uVega, uTheta, Delta, Gamma, Vega, Theta, $Price, $Delta, $Gamma, $Vega, $Theta, $Time_Value, $Notional, FX, f9 +0, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, CALL, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.5768068954653813, 0.6427860135978315, 0.391592427081917, 4.915801583071703, -20.166163353481476, 9.641790203967473, 5.873886406228755, 73.73702374607555, -302.49245030222215, 11909.274289984183, 18625.940769791625, 15925.131550993763, 1014.9606370552315, -1601.4183005499872, 4786.093789984206, 2897689.1805000002, 1.37646, SYM +1, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, -5.358002241713311, 5.873886406228755, 73.73702374607555, -302.50057493980034, 4786.192353109285, -10350.550083271604, 15925.131550993763, 1014.9606370552315, -1601.4613130062987, 4786.192353109285, 2897689.1805000002, 1.37646, SYM +2, 201401, 2013 - 12 - 20 16:15:00, -48.0, F22, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -5.738886206065227, -9.829995990815009, -126.18319185932137, 529.3397696979075, -3772.3383984361194, -11086.338978290602, -26650.835319775462, -1736.8611626668148, 2802.3654592245452, -3772.3383984361194, -9272605.3776, 1.37646, SYM +3, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -83.2851813261039, -47.49312636245157, -4452.541332815905, 967.541433029926, -147525.24472279268, -160889.7125497546, -128762.15724702866, -61287.4504296778, 5122.238772724507, -147525.24472279268, -55249273.7082, 1.37646, SYM +4, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -55.816886577678304, -37.53526875242445, -3467.56379683394, 742.5377607142022, -88047.84694353123, -107826.65888355605, -101764.66675460352, -47729.62863790045, 3931.052023510272, -88047.84694353123, -50999329.576799996, 1.37646, SYM +5, 201401, 2013 - 12 - 20 16:15:00, -350.0, F22, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -18.747569933353994, -37.422354713501335, -502.3611320768588, 2203.8743830073104, -11079.355260832921, -36216.420371031316, -101458.53708176922, -6914.80003858513, 11667.480512439395, -11079.355260832921, -67612747.545, 1.37646, SYM +6, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -6.250282470010408, -4.881689031462497, -445.634554775733, 94.30069881989306, -8837.042047978748, -12074.25059227865, -13235.111243323556, -6133.9813926660545, 499.23515345242305, 
-8837.042047978748, -8306708.9841, 1.37646, SYM +7, 201401, 2013 - 12 - 20 16:15:00, -557.0, F22, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -13.001732844932882, -28.637958149630503, -404.89795750367495, 1870.8354898520474, -7172.696641740786, -25116.653728342328, -77642.50435641785, -5573.258425855083, 9904.346993699035, -7172.696641740786, -107600858.2359, 1.37646, SYM +8, 201401, 2013 - 12 - 20 16:15:00, -607.0, F22, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -6.21340878871455, -14.573662229424174, -217.5395148200721, 1061.1931941992289, -3283.2053243209966, -12003.018280721177, -39511.74267470002, -2994.344405692364, 5618.038400336425, -3283.2053243209966, -117259822.1709, 1.37646, SYM +9, 201401, 2013 - 12 - 20 16:15:00, -799.0, F22, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -3.67256511020681, -8.962679290211902, -141.2168777833172, 727.1473791081288, -1891.026786204374, -7094.634789685377, -24299.388322293227, -1943.793835936248, 3849.574159412212, -1891.026786204374, -154350243.6813, 1.37646, SYM +10, 201401, 2013 - 12 - 20 16:15:00, -377.0, F22, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.8028324007636266, -2.007522435578226, -33.3372888974667, 180.92034663303465, -407.4960157678369, -1550.905840965067, -5442.743810001693, -458.87444675807006, 957.8062320250265, -407.4960157678369, -72828588.06989999, 1.37646, SYM +11, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 5.132620238108891, -3.4859412142879673, -389.0162662945974, 100.63494106610229, -8599.471252145018, 9915.158754388978, -9450.995231657676, -5354.653299038616, 532.7691191532583, -8599.471252145018, -8306708.9841, 1.37646, SYM +12, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 42.252164750761594, -26.77313184223898, -2916.44917566044, 736.4308784923738, -74018.27549798116, 81622.4271006569, -72586.63466280713, -40143.75632329569, 3898.7217192677417, -74018.27549798116, -50999329.576799996, 1.37646, SYM +13, 201401, 2013 - 12 - 20 16:15:00, -376.0, F22, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 8.129432111155017, -16.588182788574088, -256.2233515569436, 1293.376935891353, -4877.913910511415, 15704.378314735444, -44973.45961948536, -3526.811944840706, 6847.236989142353, -4877.913910511415, -72635408.7912, 1.37646, SYM +14, 201401, 2013 - 12 - 20 16:15:00, -301.0, F22, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 13.832831800210249, -26.270894483076166, -382.319054437795, 
1818.2895157389635, -8696.984965596635, 26722.164695430383, -71224.98149804553, -5262.468856714474, 9626.164564746361, -8696.984965596635, -58146962.8887, 1.37646, SYM +15, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 60.479027414159546, -35.32646904379539, -3756.917881714376, 926.127984537317, -111492.45225751627, 116832.94892344868, -95776.22511698281, -51712.4718746457, 4902.992790754752, -111492.45225751627, -55249273.7082, 1.37646, SYM +16, 201401, 2013 - 12 - 20 16:15:00, -739.0, F22, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 71.50658521495254, -121.45295017532867, -1668.0672036283765, 7486.937257302868, -48400.084510256995, 138135.90554124617, -329280.15202122304, -22960.277831063155, 39636.42175841195, -48400.084510256995, -142759486.9593, 1.37646, SYM +17, 201401, 2013 - 12 - 20 16:15:00, -669.0, F22, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 130.4367125693822, -186.30959150662477, -2430.001478055598, 10357.656693727877, -98837.84833028633, 251976.70050152476, -505117.8297913038, -33447.99834484408, 54834.231279417974, -98837.84833028633, -129236937.4503, 1.37646, SYM +18, 201401, 2013 - 12 - 20 16:15:00, -471.0, F22, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 168.24127038979793, -184.4400331555829, -2315.3425456267723, 9498.518053109732, -150286.43988763154, 325007.2726147283, -500049.1307012041, -31869.76400353427, 50285.885228397776, -150286.43988763154, -90987440.2677, 1.37646, SYM +19, 201401, 2013 - 12 - 20 16:15:00, -364.0, F22, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 274.5707270439409, -121.26988885703983, -1509.8704335163682, 6143.031624397461, -396952.9396004471, 530413.7500248309, -328783.8408272312, -20782.762569179402, 32521.681960454345, -68777.34640044652, -70317257.4468, 1.37646, SYM +20, 201401, 2013 - 12 - 20 16:15:00, -394.0, F22, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 346.8878572984298, -80.68788375793986, -1035.7536998452629, 4344.282826256274, -657341.595950662, 670115.460626992, -218758.93991649026, -14256.735376890107, 22998.967457802737, -30955.94375066146, -76112635.8078, 1.37646, SYM +21, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -11.648277108545999, -6.642395295447772, -622.733053540686, 135.32048014404558, -20632.901359831147, -22502.057699266377, -18008.69332126275, -8571.671388766126, 716.3970311502808, -20632.901359831147, -7727171.148, 1.37646, SYM +22, 201401, 2013 - 12 - 20 16:15:00, -12.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, 
-11.027911868706408, -1.4347215515163068, -2.4574989977037522, -31.545797964830342, 132.33494242447688, -943.0845996090299, -2771.5847445726504, -6662.708829943866, -434.2152906667037, 700.5913648061363, -943.0845996090299, -2318151.3444, 1.37646, SYM +23, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -9.514242030286075, -6.398057173708713, -591.0620108239671, 126.56893648537539, -15008.155729011005, -18379.544127878875, -17346.250014989233, -8135.732154187577, 670.0656858256148, -15008.155729011005, -8693067.5415, 1.37646, SYM +24, 201401, 2013 - 12 - 20 16:15:00, -57.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -3.0531756748605074, -6.09449776762736, -81.813098652517, 358.9166852326191, -1804.3521424785042, -5898.102746139386, -16523.247467602414, -1126.1245777124357, 1900.1325405972727, -1804.3521424785042, -11011218.8859, 1.37646, SYM +25, 201401, 2013 - 12 - 20 16:15:00, -68.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -1.5872851588068868, -3.496195967997979, -49.430989425942364, 228.39643323148874, -875.6613494405268, -3066.306020695293, -9478.797659311334, -680.3977970523262, 1209.1482864839038, -875.6613494405268, -13136190.9516, 1.37646, SYM +26, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -2.7617527193069247, -2.1570253859950568, -196.90829164509134, 41.66775064134809, -3904.7395095720058, -5335.133982634753, -5848.072409840642, -2710.3638711780245, 220.5922771068846, -3904.7395095720058, -3670406.2953000003, 1.37646, SYM +27, 201401, 2013 - 12 - 20 16:15:00, -91.0, GEE1, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -0.9314995053921319, -2.1848488680026357, -32.613007987852654, 159.091566181433, -492.21035339902915, -1799.464025610588, -5923.5067271790795, -448.9050097495967, 842.2429891772894, -492.21035339902915, -17579314.3617, 1.37646, SYM +28, 201401, 2013 - 12 - 20 16:15:00, -117.0, GEE1, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -0.5377848784658282, -1.3124323866768368, -20.678816896931302, 106.47840219731049, -276.9088034867481, -1038.8889491779587, -3558.233333802638, -284.63564305950064, 563.7048518788846, -276.9088034867481, -22601975.6079, 1.37646, SYM +29, 201401, 2013 - 12 - 20 16:15:00, -126.0, GEE1, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.26832064322603966, -0.6709491429253489, -11.141905573158631, 60.46674715056331, -136.19230235211526, -518.3398831872638, -1819.0602654117067, -153.3638734522993, 320.1156107033245, -136.19230235211526, -24340589.1162, 1.37646, SYM +30, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, PUT, STRAT, 137.5, 
140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 2.2679019656760215, -1.5402996063132879, -171.8909083627291, 44.46660186641729, -3799.766367226869, 4381.11665891606, -4176.021148871998, -2366.009597249621, 235.40961078864902, -3799.766367226869, -3670406.2953000003, 1.37646, SYM +31, 201401, 2013 - 12 - 20 16:15:00, -64.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 1.3837331253029816, -2.8235204746509086, -43.612485371394655, 220.14926568363455, -830.2832188104537, 2673.0856705932674, -7655.056956508147, -600.3084161430988, 1165.48714708806, -830.2832188104537, -12363473.8368, 1.37646, SYM +32, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 7.2020735370616356, -4.563602018563462, -497.122018578484, 125.52799065210918, -12616.751505337697, 13912.913710339246, -12372.721817523941, -6842.685736925402, 664.5548385115469, -12616.751505337697, -8693067.5415, 1.37646, SYM +33, 201401, 2013 - 12 - 20 16:15:00, -51.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 2.3437688432249923, -4.451214679856759, -64.77831154926095, 308.08227675311343, -1473.5755257323203, 4527.675745737374, -12068.020120931302, -891.6475471509574, 1631.011271767656, -1473.5755257323203, -9852143.2137, 1.37646, SYM +34, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 8.458605232749587, -4.940764901230125, -525.443060379633, 129.5283894457786, -15593.349966086193, 16340.27257670611, -13395.276240137457, -7232.5135489014965, 685.733257448217, -15593.349966086193, -7727171.148, 1.37646, SYM +35, 201401, 2013 - 12 - 20 16:15:00, -98.0, GEE1, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 9.4826053465025, -16.10607458346713, -221.20512307927052, 992.855008410935, -6418.414454675487, 18318.42861034117, -43666.38010565609, -3044.8000371369267, 5256.250787989675, -6418.414454675487, -18931569.312599998, 1.37646, SYM +36, 201401, 2013 - 12 - 20 16:15:00, -111.0, GEE1, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 21.64196576263292, -30.912353747735946, -403.18410174016645, 1718.5349671207687, -16399.10487991298, 41807.79335675523, -83808.78790259302, -5549.667886812695, 9098.05631093482, -16399.10487991298, -21442899.9357, 1.37646, SYM +37, 201401, 2013 - 12 - 20 16:15:00, -108.0, GEE1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 38.577616140335834, -42.29198212484704, -530.9065709717439, 2178.0041395665626, -34460.58494238685, 74523.96059955555, -114660.94716715509, -7307.7165867976655, 11530.52145364535, -34460.58494238685, -20863362.0996, 
1.37646, SYM +38, 201401, 2013 - 12 - 20 16:15:00, -83.0, GEE1, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 62.608160287492026, -27.652199931687655, -344.28364280730375, 1400.746222046674, -90513.99446933273, 120945.99245071695, -74969.94172708844, -4738.926629785414, 7415.658249224481, -15682.746569332587, -16033880.132100001, 1.37646, SYM +39, 201401, 2013 - 12 - 20 16:15:00, -56.0, GEE1, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 49.30385789013216, -11.468328655950843, -147.21372383587493, 617.4615184526685, -93429.26236862202, 95244.83704343032, -31092.641206404707, -2026.3380231112837, 3268.888775728308, -4399.8295686219335, -10818039.607199999, 1.37646, SYM""" + csv = StringIO(csv) + df = read_csv(csv).set_index(['index']) + for _ in range(10): + library.write('pandas', df[:-2]) + result = library.read('pandas').data + assert len(result) == len(df[:-2]) + assert np.all(df[:-2].values == result.values) + assert np.all(df[:-2].columns.values == result.columns.values) + for _ in range(10): + library.write('pandas', df[:-1]) + result = library.read('pandas').data + assert len(result) == len(df[:-1]) + assert np.all(df[:-1].values == result.values) + assert np.all(df[:-1].columns.values == result.columns.values) + for _ in range(10): + library.write('pandas', df) + result = library.read('pandas').data + assert len(result) == len(df) + assert np.all(df.values == result.values) + assert np.all(df.columns.values == result.columns.values) + + +def test_large_dataframe_rewrite_same_item(library): + csv = \ +"""index, f1, f2, f3, f4, f5, f6, f7, f8, iVol, tau, uPrice, uDelta, uGamma, uVega, uTheta, Delta, Gamma, Vega, Theta, $Price, $Delta, $Gamma, $Vega, $Theta, $Time_Value, $Notional, FX, f9 +0, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, CALL, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.5768068954653813, 0.6427860135978315, 0.391592427081917, 4.915801583071703, -20.166163353481476, 9.641790203967473, 5.873886406228755, 73.73702374607555, -302.49245030222215, 11909.274289984183, 18625.940769791625, 15925.131550993763, 1014.9606370552315, -1601.4183005499872, 4786.093789984206, 2897689.1805000002, 1.37646, SYM +1, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, -5.358002241713311, 5.873886406228755, 73.73702374607555, -302.50057493980034, 4786.192353109285, -10350.550083271604, 15925.131550993763, 1014.9606370552315, -1601.4613130062987, 4786.192353109285, 2897689.1805000002, 1.37646, SYM +2, 201401, 2013 - 12 - 20 16:15:00, -48.0, F22, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -5.738886206065227, -9.829995990815009, -126.18319185932137, 529.3397696979075, -3772.3383984361194, -11086.338978290602, -26650.835319775462, -1736.8611626668148, 2802.3654592245452, -3772.3383984361194, -9272605.3776, 1.37646, SYM +3, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, 
-83.2851813261039, -47.49312636245157, -4452.541332815905, 967.541433029926, -147525.24472279268, -160889.7125497546, -128762.15724702866, -61287.4504296778, 5122.238772724507, -147525.24472279268, -55249273.7082, 1.37646, SYM +4, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -55.816886577678304, -37.53526875242445, -3467.56379683394, 742.5377607142022, -88047.84694353123, -107826.65888355605, -101764.66675460352, -47729.62863790045, 3931.052023510272, -88047.84694353123, -50999329.576799996, 1.37646, SYM +5, 201401, 2013 - 12 - 20 16:15:00, -350.0, F22, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -18.747569933353994, -37.422354713501335, -502.3611320768588, 2203.8743830073104, -11079.355260832921, -36216.420371031316, -101458.53708176922, -6914.80003858513, 11667.480512439395, -11079.355260832921, -67612747.545, 1.37646, SYM +6, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -6.250282470010408, -4.881689031462497, -445.634554775733, 94.30069881989306, -8837.042047978748, -12074.25059227865, -13235.111243323556, -6133.9813926660545, 499.23515345242305, -8837.042047978748, -8306708.9841, 1.37646, SYM +7, 201401, 2013 - 12 - 20 16:15:00, -557.0, F22, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -13.001732844932882, -28.637958149630503, -404.89795750367495, 1870.8354898520474, -7172.696641740786, -25116.653728342328, -77642.50435641785, -5573.258425855083, 9904.346993699035, -7172.696641740786, -107600858.2359, 1.37646, SYM +8, 201401, 2013 - 12 - 20 16:15:00, -607.0, F22, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -6.21340878871455, -14.573662229424174, -217.5395148200721, 1061.1931941992289, -3283.2053243209966, -12003.018280721177, -39511.74267470002, -2994.344405692364, 5618.038400336425, -3283.2053243209966, -117259822.1709, 1.37646, SYM +9, 201401, 2013 - 12 - 20 16:15:00, -799.0, F22, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -3.67256511020681, -8.962679290211902, -141.2168777833172, 727.1473791081288, -1891.026786204374, -7094.634789685377, -24299.388322293227, -1943.793835936248, 3849.574159412212, -1891.026786204374, -154350243.6813, 1.37646, SYM +10, 201401, 2013 - 12 - 20 16:15:00, -377.0, F22, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.8028324007636266, -2.007522435578226, -33.3372888974667, 180.92034663303465, -407.4960157678369, -1550.905840965067, -5442.743810001693, -458.87444675807006, 957.8062320250265, -407.4960157678369, -72828588.06989999, 1.37646, SYM +11, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 
0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 5.132620238108891, -3.4859412142879673, -389.0162662945974, 100.63494106610229, -8599.471252145018, 9915.158754388978, -9450.995231657676, -5354.653299038616, 532.7691191532583, -8599.471252145018, -8306708.9841, 1.37646, SYM +12, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 42.252164750761594, -26.77313184223898, -2916.44917566044, 736.4308784923738, -74018.27549798116, 81622.4271006569, -72586.63466280713, -40143.75632329569, 3898.7217192677417, -74018.27549798116, -50999329.576799996, 1.37646, SYM +13, 201401, 2013 - 12 - 20 16:15:00, -376.0, F22, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 8.129432111155017, -16.588182788574088, -256.2233515569436, 1293.376935891353, -4877.913910511415, 15704.378314735444, -44973.45961948536, -3526.811944840706, 6847.236989142353, -4877.913910511415, -72635408.7912, 1.37646, SYM +14, 201401, 2013 - 12 - 20 16:15:00, -301.0, F22, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 13.832831800210249, -26.270894483076166, -382.319054437795, 1818.2895157389635, -8696.984965596635, 26722.164695430383, -71224.98149804553, -5262.468856714474, 9626.164564746361, -8696.984965596635, -58146962.8887, 1.37646, SYM +15, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 60.479027414159546, -35.32646904379539, -3756.917881714376, 926.127984537317, -111492.45225751627, 116832.94892344868, -95776.22511698281, -51712.4718746457, 4902.992790754752, -111492.45225751627, -55249273.7082, 1.37646, SYM +16, 201401, 2013 - 12 - 20 16:15:00, -739.0, F22, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 71.50658521495254, -121.45295017532867, -1668.0672036283765, 7486.937257302868, -48400.084510256995, 138135.90554124617, -329280.15202122304, -22960.277831063155, 39636.42175841195, -48400.084510256995, -142759486.9593, 1.37646, SYM +17, 201401, 2013 - 12 - 20 16:15:00, -669.0, F22, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 130.4367125693822, -186.30959150662477, -2430.001478055598, 10357.656693727877, -98837.84833028633, 251976.70050152476, -505117.8297913038, -33447.99834484408, 54834.231279417974, -98837.84833028633, -129236937.4503, 1.37646, SYM +18, 201401, 2013 - 12 - 20 16:15:00, -471.0, F22, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 168.24127038979793, -184.4400331555829, -2315.3425456267723, 9498.518053109732, -150286.43988763154, 325007.2726147283, -500049.1307012041, -31869.76400353427, 50285.885228397776, -150286.43988763154, -90987440.2677, 1.37646, SYM +19, 201401, 2013 - 
12 - 20 16:15:00, -364.0, F22, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 274.5707270439409, -121.26988885703983, -1509.8704335163682, 6143.031624397461, -396952.9396004471, 530413.7500248309, -328783.8408272312, -20782.762569179402, 32521.681960454345, -68777.34640044652, -70317257.4468, 1.37646, SYM +20, 201401, 2013 - 12 - 20 16:15:00, -394.0, F22, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 346.8878572984298, -80.68788375793986, -1035.7536998452629, 4344.282826256274, -657341.595950662, 670115.460626992, -218758.93991649026, -14256.735376890107, 22998.967457802737, -30955.94375066146, -76112635.8078, 1.37646, SYM +21, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -11.648277108545999, -6.642395295447772, -622.733053540686, 135.32048014404558, -20632.901359831147, -22502.057699266377, -18008.69332126275, -8571.671388766126, 716.3970311502808, -20632.901359831147, -7727171.148, 1.37646, SYM +22, 201401, 2013 - 12 - 20 16:15:00, -12.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -1.4347215515163068, -2.4574989977037522, -31.545797964830342, 132.33494242447688, -943.0845996090299, -2771.5847445726504, -6662.708829943866, -434.2152906667037, 700.5913648061363, -943.0845996090299, -2318151.3444, 1.37646, SYM +23, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -9.514242030286075, -6.398057173708713, -591.0620108239671, 126.56893648537539, -15008.155729011005, -18379.544127878875, -17346.250014989233, -8135.732154187577, 670.0656858256148, -15008.155729011005, -8693067.5415, 1.37646, SYM +24, 201401, 2013 - 12 - 20 16:15:00, -57.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -3.0531756748605074, -6.09449776762736, -81.813098652517, 358.9166852326191, -1804.3521424785042, -5898.102746139386, -16523.247467602414, -1126.1245777124357, 1900.1325405972727, -1804.3521424785042, -11011218.8859, 1.37646, SYM +25, 201401, 2013 - 12 - 20 16:15:00, -68.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -1.5872851588068868, -3.496195967997979, -49.430989425942364, 228.39643323148874, -875.6613494405268, -3066.306020695293, -9478.797659311334, -680.3977970523262, 1209.1482864839038, -875.6613494405268, -13136190.9516, 1.37646, SYM +26, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -2.7617527193069247, -2.1570253859950568, -196.90829164509134, 41.66775064134809, -3904.7395095720058, -5335.133982634753, -5848.072409840642, -2710.3638711780245, 
220.5922771068846, -3904.7395095720058, -3670406.2953000003, 1.37646, SYM +27, 201401, 2013 - 12 - 20 16:15:00, -91.0, GEE1, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -0.9314995053921319, -2.1848488680026357, -32.613007987852654, 159.091566181433, -492.21035339902915, -1799.464025610588, -5923.5067271790795, -448.9050097495967, 842.2429891772894, -492.21035339902915, -17579314.3617, 1.37646, SYM +28, 201401, 2013 - 12 - 20 16:15:00, -117.0, GEE1, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -0.5377848784658282, -1.3124323866768368, -20.678816896931302, 106.47840219731049, -276.9088034867481, -1038.8889491779587, -3558.233333802638, -284.63564305950064, 563.7048518788846, -276.9088034867481, -22601975.6079, 1.37646, SYM +29, 201401, 2013 - 12 - 20 16:15:00, -126.0, GEE1, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.26832064322603966, -0.6709491429253489, -11.141905573158631, 60.46674715056331, -136.19230235211526, -518.3398831872638, -1819.0602654117067, -153.3638734522993, 320.1156107033245, -136.19230235211526, -24340589.1162, 1.37646, SYM +30, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 2.2679019656760215, -1.5402996063132879, -171.8909083627291, 44.46660186641729, -3799.766367226869, 4381.11665891606, -4176.021148871998, -2366.009597249621, 235.40961078864902, -3799.766367226869, -3670406.2953000003, 1.37646, SYM +31, 201401, 2013 - 12 - 20 16:15:00, -64.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 1.3837331253029816, -2.8235204746509086, -43.612485371394655, 220.14926568363455, -830.2832188104537, 2673.0856705932674, -7655.056956508147, -600.3084161430988, 1165.48714708806, -830.2832188104537, -12363473.8368, 1.37646, SYM +32, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 7.2020735370616356, -4.563602018563462, -497.122018578484, 125.52799065210918, -12616.751505337697, 13912.913710339246, -12372.721817523941, -6842.685736925402, 664.5548385115469, -12616.751505337697, -8693067.5415, 1.37646, SYM +33, 201401, 2013 - 12 - 20 16:15:00, -51.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 2.3437688432249923, -4.451214679856759, -64.77831154926095, 308.08227675311343, -1473.5755257323203, 4527.675745737374, -12068.020120931302, -891.6475471509574, 1631.011271767656, -1473.5755257323203, -9852143.2137, 1.37646, SYM +34, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 8.458605232749587, 
-4.940764901230125, -525.443060379633, 129.5283894457786, -15593.349966086193, 16340.27257670611, -13395.276240137457, -7232.5135489014965, 685.733257448217, -15593.349966086193, -7727171.148, 1.37646, SYM +35, 201401, 2013 - 12 - 20 16:15:00, -98.0, GEE1, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 9.4826053465025, -16.10607458346713, -221.20512307927052, 992.855008410935, -6418.414454675487, 18318.42861034117, -43666.38010565609, -3044.8000371369267, 5256.250787989675, -6418.414454675487, -18931569.312599998, 1.37646, SYM +36, 201401, 2013 - 12 - 20 16:15:00, -111.0, GEE1, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 21.64196576263292, -30.912353747735946, -403.18410174016645, 1718.5349671207687, -16399.10487991298, 41807.79335675523, -83808.78790259302, -5549.667886812695, 9098.05631093482, -16399.10487991298, -21442899.9357, 1.37646, SYM +37, 201401, 2013 - 12 - 20 16:15:00, -108.0, GEE1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 38.577616140335834, -42.29198212484704, -530.9065709717439, 2178.0041395665626, -34460.58494238685, 74523.96059955555, -114660.94716715509, -7307.7165867976655, 11530.52145364535, -34460.58494238685, -20863362.0996, 1.37646, SYM +38, 201401, 2013 - 12 - 20 16:15:00, -83.0, GEE1, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 62.608160287492026, -27.652199931687655, -344.28364280730375, 1400.746222046674, -90513.99446933273, 120945.99245071695, -74969.94172708844, -4738.926629785414, 7415.658249224481, -15682.746569332587, -16033880.132100001, 1.37646, SYM +39, 201401, 2013 - 12 - 20 16:15:00, -56.0, GEE1, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 49.30385789013216, -11.468328655950843, -147.21372383587493, 617.4615184526685, -93429.26236862202, 95244.83704343032, -31092.641206404707, -2026.3380231112837, 3268.888775728308, -4399.8295686219335, -10818039.607199999, 1.37646, SYM""" + csv = StringIO(csv) + df = read_csv(csv).set_index(['index']) + for _ in range(100): + library.write('pandas', df) + result = library.read('pandas').data + assert len(result) == len(df) + assert np.all(df.values == result.values) + assert np.all(df.columns.values == result.columns.values) + + +def test_append_after_truncate_after_append(library): + columns = ['MAIN_UPPER', 'MAIN_LOWER', 'AUX_UPPER', 'AUX_LOWER', 'TARGET_HEDGE_POSITION'] + empty_df = DataFrame(columns=columns, dtype=np.float64) + library.write('sym', empty_df) + full_df = DataFrame(data=[np.zeros(5)], columns=columns) + library.write('sym', full_df) + library.write('sym', empty_df) + full_df = DataFrame(data=[np.zeros(5)], columns=columns) + library.write('sym', full_df) + assert len(library.read('sym', 1).data) == 0 + assert len(library.read('sym', 2).data) == 1 + assert len(library.read('sym', 3).data) == 0 + assert len(library.read('sym', 4).data) == 1 + + +def test_can_write_pandas_df_with_object_columns(library): + expected = DataFrame(data=dict(A=['a', 'b', None, 'c'], B=[1., 2., 3., 
4.]), index=range(4)) + library.write('objects', expected) + saved_df = library.read('objects').data + + assert_frame_equal(saved_df, expected) + + +def panel(i1, i2, i3): + return Panel(np.random.randn(i1, i2, i3), range(i1), ['A%d' % i for i in range(i2)], + list(rrule(DAILY, count=i3, dtstart=dt(1970, 1, 1), interval=1))) + + +@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([1, 2, 4], r=3))) +def test_panel_save_read(library, df_size): + '''Note - empties are not tested here as they don't work!''' + pn = panel(*df_size) + library.write('pandas', pn) + result = library.read('pandas').data + assert np.all(pn.values == result.values), str(pn.values) + "!=" + str(result.values) + for i in range(3): + assert np.all(pn.axes[i] == result.axes[i]) + if None not in pn.axes[i].names: + assert np.all(pn.axes[i].names == result.axes[i].names), \ + str(pn.axes[i].names) + "!=" + str(result.axes[i].names) + + +def test_save_read_ints(library): + ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)], + data={'col1':np.arange(5), 'col2':np.arange(5)}) + ts1.index.name = 'index' + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +def test_save_read_datetimes(library): + # FEF symbols have datetimes in the CLOSE_REVISION field. Handle specially. + ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(3)], + data={'field1': [1, 2, 3], + 'revision': [dt(2013, 1, 1), dt(2013, 1, 2), dt(2013, 1, 3)], + 'field2': [4, 5, 6]}, + ) + ts1.index.name = 'index' + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +def test_labels(library): + ts1 = DataFrame(index=[dt(2012, 1, 1), dt(2012, 1, 2)], + data={'data': [1., 2.]}) + ts1.index.name = 'some_index' + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +def test_duplicate_labels(library): + ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)], + data=[[np.arange(5), np.arange(5, 10)]], + columns=['a', 'a'] + ) + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +def test_no_labels(library): + ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)], + data=[[np.arange(5), np.arange(5, 10)]]) + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +@pytest.mark.xfail(reason='needs investigating') +def test_no_index_labels(library): + ts1 = DataFrame(index=[dt(2012, 1, 1), dt(2012, 1, 2)], + data={'data': [1., 2.]}) + library.write('TEST_1', ts1) + ts2 = library.read('TEST_1').data + assert_frame_equal(ts1, ts2) + + +def test_not_unique(library): + d = dt.now() + ts = DataFrame(index=[d, d], data={'near': [1., 2.]}) + ts.index.name = 'index' + library.write('ts', ts) + ts2 = library.read('ts').data + assert_frame_equal(ts, ts2) diff --git a/tests/integration/store/test_pickle_store.py b/tests/integration/store/test_pickle_store.py new file mode 100644 index 000000000..f3f27733b --- /dev/null +++ b/tests/integration/store/test_pickle_store.py @@ -0,0 +1,94 @@ +import bson +from datetime import datetime as dt, timedelta +from mock import patch +import numpy as np +import re + +from arctic.arctic import Arctic + + +def test_save_read_bson(library): + blob = {'foo': dt(2015, 1, 1), 'bar': ['a', 'b', ['x', 'y', 'z']]} + library.write('BLOB', blob) + saved_blob = library.read('BLOB').data + assert blob == saved_blob + + +def 
test_save_read_bson_object(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic} + library.write('BLOB', blob) + saved_blob = library.read('BLOB').data + assert blob == saved_blob + + +def test_get_info_bson_object(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic} + library.write('BLOB', blob) + assert library._get_info('BLOB').startswith('Handler: PickleStore') + + +def test_bson_large_object(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic, + 'large_thing': np.random.rand(2.1 * 1024 * 1024).tostring()} + assert len(blob['large_thing']) > 16 * 1024 * 1024 + library.write('BLOB', blob) + saved_blob = library.read('BLOB').data + assert blob == saved_blob + + +def test_bson_leak_objects_delete(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic} + library.write('BLOB', blob) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 1 + library.delete('BLOB') + assert library._collection.count() == 0 + assert library._collection.versions.count() == 0 + + +def test_bson_leak_objects_prune_previous(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic} + + yesterday = dt.utcnow() - timedelta(days=1, seconds=1) + _id = bson.ObjectId.from_datetime(yesterday) + with patch("bson.ObjectId", return_value=_id): + library.write('BLOB', blob) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 1 + + _id = bson.ObjectId.from_datetime(dt.utcnow() - timedelta(minutes=130)) + with patch("bson.ObjectId", return_value=_id): + library.write('BLOB', {}, prune_previous_version=False) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 2 + + # This write should prune the oldest version in the chunk collection + library.write('BLOB', {}) + assert library._collection.count() == 0 + assert library._collection.versions.count() == 2 + + +def test_prune_previous_doesnt_kill_other_objects(library): + blob = {'foo': dt(2015, 1, 1), 'object': Arctic} + + yesterday = dt.utcnow() - timedelta(days=1, seconds=1) + _id = bson.ObjectId.from_datetime(yesterday) + with patch("bson.ObjectId", return_value=_id): + library.write('BLOB', blob, prune_previous_version=False) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 1 + + _id = bson.ObjectId.from_datetime(dt.utcnow() - timedelta(hours=10)) + with patch("bson.ObjectId", return_value=_id): + library.write('BLOB', blob, prune_previous_version=False) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 2 + + # This write should prune the oldest version in the chunk collection + library.write('BLOB', {}) + assert library._collection.count() == 1 + assert library._collection.versions.count() == 2 + + library._delete_version('BLOB', 2) + assert library._collection.count() == 0 + assert library._collection.versions.count() == 1 diff --git a/tests/integration/store/test_version_store.py b/tests/integration/store/test_version_store.py new file mode 100644 index 000000000..dd2ed7291 --- /dev/null +++ b/tests/integration/store/test_version_store.py @@ -0,0 +1,858 @@ +import bson +from bson.son import SON +from datetime import datetime as dt, timedelta as dtd +import pandas as pd +from pandas.util.testing import assert_frame_equal +from pymongo.errors import OperationFailure +from pymongo.read_preferences import ReadPreference +from pymongo.server_type import SERVER_TYPE +from datetime import datetime +from mock import patch +import time +import pytest + +from 
arctic.exceptions import NoDataFoundException, DuplicateSnapshotException + +from ...util import read_str_as_pandas + + +ts1 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 2.0 + 2012-10-09 17:06:11.040 | 2.5 + 2012-11-08 17:06:11.040 | 3.0""") + +ts2 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 4.0 + 2012-10-09 17:06:11.040 | 4.5 + 2012-10-10 17:06:11.040 | 5.0 + 2012-11-08 17:06:11.040 | 3.0""") + +ts1_append = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 2.0 + 2012-10-09 17:06:11.040 | 2.5 + 2012-11-08 17:06:11.040 | 3.0 + 2012-11-09 17:06:11.040 | 3.0""") + + +symbol = 'TS1' + + +from pymongo.cursor import _QUERY_OPTIONS +from pymongo.message import query as __query +def _query(allow_secondary, library_name): + def _internal_query(options, *args, **kwargs): + coll_name = args[0] + data_coll_name = 'arctic_{}'.format(library_name) + versions_coll_name = data_coll_name + '.versions' + if allow_secondary and coll_name in (data_coll_name, versions_coll_name): + # Reads to the Version and Chunks collections are allowed to slaves + assert bool(options & _QUERY_OPTIONS['slave_okay']) == allow_secondary, "{}: options:{}".format(coll_name, options) + elif '.$cmd' not in coll_name: + # All other collections we force PRIMARY read. + assert bool(options & _QUERY_OPTIONS['slave_okay']) == False, "{}: options:{}".format(coll_name, options) + return __query(options, *args, **kwargs) + return _internal_query + + +# MongoDB always sets slaveOk when talking to a single server. +# Pretend we're a mongos for the tests that care... +# +# A _Query's slaveOk bit is already set for queries with non-primary +# read preference. If this is a direct connection to a mongod, override +# and *always* set the slaveOk bit. See bullet point 2 in +# server-selection.rst#topology-type-single. 
+# set_slave_ok = ( +# topology.description.topology_type == TOPOLOGY_TYPE.Single +# and server.description.server_type != SERVER_TYPE.Mongos) + + +def test_store_item_new_version(library, library_name): + with patch('pymongo.message.query', side_effect=_query(False, library_name)), \ + patch('pymongo.server_description.ServerDescription.server_type', SERVER_TYPE.Mongos): + library.write(symbol, ts1) + coll = library._collection + count = coll.count() + assert coll.versions.count() == 1 + + # No change to the TS + library.write(symbol, ts1, prune_previous_version=False) + assert coll.count() == count + assert coll.versions.count() == 2 + + +def test_store_item_read_preference(library_secondary, library_name): + with patch('arctic.arctic.ArcticLibraryBinding.check_quota'), \ + patch('pymongo.message.query', side_effect=_query(False, library_name)) as query, \ + patch('pymongo.server_description.ServerDescription.server_type', SERVER_TYPE.Mongos): + # write an item + library_secondary.write(symbol, ts1) + library_secondary.write(symbol, ts1_append, prune_previous_version=False) + # delete an individual version + library_secondary._delete_version(symbol, 1) + # delete the item entirely + library_secondary.delete(symbol) + assert query.call_count > 0 + + +def test_read_item_read_preference_SECONDARY(library_secondary, library_name): + # write an item + library_secondary.write(symbol, ts1) + with patch('pymongo.message.query', side_effect=_query(True, library_name)) as query, \ + patch('pymongo.server_description.ServerDescription.server_type', SERVER_TYPE.Mongos): + library_secondary.read(symbol) + assert query.call_count > 0 + + +def test_query_falls_back_to_primary(library_secondary, library_name): + allow_secondary = [True] + def _query(options, *args, **kwargs): + # If we're allowing secondary read then raise when reading a chunk. + # We should attempt a call to primary only subsequently. 
+ if args[0] == 'arctic_{}'.format(library_name) and \ + bool(options & _QUERY_OPTIONS['slave_okay']) == True: + allow_secondary[0] = False + raise OperationFailure("some_error") + return __query(options, *args, **kwargs) + + library_secondary.write(symbol, ts1) + with patch('pymongo.message.query', side_effect=_query), \ + patch('pymongo.server_description.ServerDescription.server_type', SERVER_TYPE.Mongos): + assert library_secondary.read(symbol) is not None + # We raised at least once on a secondary read + assert allow_secondary[0] == False + + +def test_store_item_metadata(library): + library.write(symbol, ts1, metadata={'key': 'value'}) + + after = library.read(symbol) + + assert after.metadata['key'] == 'value' + assert after.version + assert_frame_equal(after.data, ts1) + + +def test_read_metadata(library): + library.write(symbol, ts1, metadata={'key': 'value'}) + + after = library.read_metadata(symbol) + + assert after.metadata['key'] == 'value' + assert after.version + assert after.data is None + + +def test_read_metadata_throws_on_deleted_symbol(library): + library.write(symbol, ts1, metadata={'key': 'value'}) + library.delete(symbol) + + with pytest.raises(NoDataFoundException): + library.read_metadata(symbol) + + +def test_store_item_and_update(library): + coll = library._collection + + # Store the first timeseries + none = datetime.now() + time.sleep(1) + library.write(symbol, ts1) + original = datetime.now() + + # Assertions: + assert coll.versions.count() == 1 + assert_frame_equal(library.read(symbol).data, ts1) + + # Update the TimeSeries + time.sleep(1) + library.write(symbol, ts2, prune_previous_version=False) + recent = datetime.now() + + assert coll.versions.count() == 2 + assert_frame_equal(library.read(symbol).data, ts2) + + # Get the different versions of the DB + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of=none) + assert_frame_equal(library.read(symbol, as_of=original).data, ts1) + assert_frame_equal(library.read(symbol, as_of=recent).data, ts2) + + # Now push back in the original version + time.sleep(1) + library.write(symbol, ts1, prune_previous_version=False) + + assert coll.versions.count() == 3 + assert_frame_equal(library.read(symbol).data, ts1) + + # Get the different versions of the DB + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of=none) + assert_frame_equal(library.read(symbol, as_of=original).data, ts1) + assert_frame_equal(library.read(symbol, as_of=recent).data, ts2) + assert_frame_equal(library.read(symbol, as_of=datetime.now()).data, ts1) + + +def test_append_update(library): + library.write(symbol, ts1) + library.snapshot('snap') + + coll = library._collection + + # Assertions: + assert coll.versions.count() == 1 + assert_frame_equal(library.read(symbol).data, ts1) + + # Append an item + dts = list(ts1.index) + dts.append(dts[-1] + dtd(days=1)) + values = list(ts1.near.values) + values.append(47.) + ts2 = pd.DataFrame(index=dts, data=values, columns=ts1.columns) + ts2.index.name = ts1.index.name + + # Saving ts2 shouldn't create any new chunks. Instead it should + # reuse the last chunk. + library.write(symbol, ts2, prune_previous_version=False) + assert coll.versions.count() == 2 + assert_frame_equal(library.read(symbol, as_of='snap').data, ts1) + assert_frame_equal(library.read(symbol).data, ts2) + + # We should be able to save a smaller timeseries too + # This isn't likely to happen, so we don't care too much about space saving + # just make sure we get it right. 
+ library.write(symbol, ts1, prune_previous_version=False) + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + assert_frame_equal(library.read(symbol, as_of=3).data, ts1) + + # Append an item, and add a whole new chunk + dts = list(ts2.index) + dts.append(dts[-1] + dtd(days=1)) + dts.append(dts[-1] + dtd(days=40)) + values = list(ts2.near.values) + values.append(47.) + values.append(53.) + ts3 = pd.DataFrame(index=dts, data=values, columns=ts1.columns) + ts3.index.name = ts1.index.name + + library.write(symbol, ts3, prune_previous_version=False) + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + assert_frame_equal(library.read(symbol, as_of=3).data, ts1) + assert_frame_equal(library.read(symbol, as_of=4).data, ts3) + + library.write(symbol, ts3, prune_previous_version=False) + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + assert_frame_equal(library.read(symbol, as_of=3).data, ts1) + assert_frame_equal(library.read(symbol, as_of=4).data, ts3) + assert_frame_equal(library.read(symbol, as_of=5).data, ts3) + + +def test_append(library): + library.append(symbol, ts1, upsert=True) + library.append(symbol, ts1_append, upsert=True) + assert len(library.read(symbol).data) == len(ts1) + len(ts1_append) + + +def test_append_should_overwrite_after_delete(library): + library.append(symbol, ts1, upsert=True) + library.append(symbol, ts1_append, upsert=True) + assert len(library.read(symbol).data) == len(ts1) + len(ts1_append) + library.delete(symbol) + library.append(symbol, ts1_append, upsert=True) + assert len(library.read(symbol).data) == len(ts1_append) + + +def test_append_empty_ts(library): + library.append(symbol, ts1, upsert=True) + library.append(symbol, pd.DataFrame(), upsert=True) + assert len(library.read(symbol).data) == len(ts1) + + +def test_query_version_as_of_int(library): + # Store the first timeseries + library.write(symbol, ts1) + library.write(symbol, ts2, prune_previous_version=False) + + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol).data, ts2) + + +def test_list_version(library): + assert len(list(library.list_versions(symbol))) == 0 + dates = [None, None, None] + now = dt.utcnow() + for x in xrange(len(dates)): + dates[x] = now - dtd(minutes=130 - x) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(dates[x])): + library.write(symbol, ts1, prune_previous_version=False) + assert len(list(library.list_versions(symbol))) == 3 + + library.write(symbol, ts1, prune_previous_version=True) + assert len(list(library.list_versions(symbol))) >= 2 + + versions = list(library.list_versions(symbol)) + for i, x in enumerate([4, 3]): + assert versions[i]['symbol'] == symbol + assert versions[i]['date'] >= dates[i] + assert versions[i]['version'] == x + + +def test_list_version_latest_only(library): + assert len(list(library.list_versions(symbol))) == 0 + dates = [None, None, None] + now = dt.utcnow() + for x in xrange(len(dates)): + dates[x] = now - dtd(minutes=20 - x) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(dates[x])): + library.write(symbol, ts1, prune_previous_version=False) + assert len(list(library.list_versions(symbol))) == 3 + + library.write(symbol, ts1, prune_previous_version=True) + assert len(list(library.list_versions(symbol, latest_only=True))) == 1 + + 
versions = list(library.list_versions(symbol)) + for i, x in enumerate([4, ]): + assert versions[i]['symbol'] == symbol + assert versions[i]['date'] >= dates[i] + assert versions[i]['version'] == x + + +def test_list_version_snapshot(library): + library.write('A', ts1) + library.snapshot('one') + library.write('B', ts2) + library.snapshot('two') + library.write('A', ts2) + library.snapshot('three') + library.write('C', ts2) + + assert set(x['symbol'] for x in library.list_versions()) \ + == set(['A', 'B', 'C']) + + assert set(x['symbol'] for x in library.list_versions(snapshot='one')) \ + == set(['A']) + + assert set(x['symbol'] for x in library.list_versions(snapshot='two')) \ + == set(['A', 'B']) + + assert set(x['symbol'] for x in library.list_versions(snapshot='three')) \ + == set(['A', 'B']) + + assert [x['snapshots'] for x in library.list_versions(symbol='A')] \ + == [['three', ], ['one', 'two']] + + assert [x['snapshots'] for x in library.list_versions(symbol='B')] \ + == [['two', 'three']] + + assert all('parent' not in x for x in library.list_versions(symbol='C')) + + +def test_delete_versions(library): + library.write(symbol, ts1) + library.write(symbol, ts2, prune_previous_version=False) + library.write(symbol, ts1, prune_previous_version=False) + library.write(symbol, ts2, prune_previous_version=False) + + coll = library._collection + + # Delete version 1 (ts1) + library._delete_version(symbol, 1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + assert_frame_equal(library.read(symbol, as_of=3).data, ts1) + + library._delete_version(symbol, 2) + assert_frame_equal(library.read(symbol, as_of=3).data, ts1) + assert_frame_equal(library.read(symbol, as_of=4).data, ts2) + + library._delete_version(symbol, 3) + assert_frame_equal(library.read(symbol).data, ts2) + + library._delete_version(symbol, 4) + assert coll.count() == 0 + + +def test_delete_bson_versions(library): + coll = library._collection + + a = [{'a':'b'}] + c = [{'c':'d'}] + library.write(symbol, a) + library.write(symbol, c, prune_previous_version=False) + library.write(symbol, a, prune_previous_version=False) + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 4 + + library._delete_version(symbol, 1) + assert library.read(symbol, as_of=2).data == c + assert library.read(symbol, as_of=3).data == a + assert coll.versions.count() == 3 + + library._delete_version(symbol, 2) + assert library.read(symbol, as_of=3).data == a + assert library.read(symbol, as_of=4).data == c + assert coll.versions.count() == 2 + + library._delete_version(symbol, 3) + assert coll.versions.count() == 1 + assert library.read(symbol).data == c + + library._delete_version(symbol, 4) + assert coll.versions.count() == 0 + + +def test_delete_item_has_symbol(library): + library.write(symbol, ts1) + library.write(symbol, ts2, prune_previous_version=False) + library.write(symbol, ts1, prune_previous_version=False) + library.write(symbol, ts2, prune_previous_version=False) + + library.delete(symbol) + for version in (1, 2, 3, 4, None): + with pytest.raises(NoDataFoundException): + library.read(symbol, version) + + # Has symbol returns false - this should really be has_data + assert not library.has_symbol(symbol) + assert symbol not in library.list_symbols() + assert [x['version'] for x in library.list_versions(symbol)] == [] + + +def test_delete_item_snapshot(library): + library.write(symbol, ts1) + library.write(symbol, ts2, prune_previous_version=False) + library.write(symbol, ts1, 
prune_previous_version=False) + library.snapshot('snap') + library.write(symbol, ts2, prune_previous_version=False) + + library.delete(symbol) + + for version in (1, 2, 4, None): + with pytest.raises(NoDataFoundException): + library.read(symbol, version) + + # Can get the version out of the snapshots + assert_frame_equal(library.read(symbol, 'snap').data, ts1) + assert_frame_equal(library.read(symbol, 3).data, ts1) + + assert not library.has_symbol(symbol) + assert not library.has_symbol(symbol, as_of=2) + assert library.has_symbol(symbol, as_of=3) + assert symbol in library.list_symbols(all_symbols=True) + assert symbol in library.list_symbols(snapshot='snap') + assert symbol not in library.list_symbols() + assert sorted([x['version'] for x in library.list_versions(symbol)]) == [3, 5] + + # Should be able to create another snapshot + library.snapshot('snap2') + with pytest.raises(NoDataFoundException): + library.read(symbol, 'snap2') + assert_frame_equal(library.read(symbol, 'snap').data, ts1) + assert symbol in library.list_symbols(snapshot='snap') + assert symbol not in library.list_symbols(snapshot='snap2') + + +def test_has_symbol(library): + assert not library.has_symbol(symbol) + library.write(symbol, ts1) + assert library.has_symbol(symbol) + + +def test_snapshot(library): + library.write(symbol, ts1) + library.snapshot('current') + library.write(symbol, ts2) + assert_frame_equal(library.read(symbol, as_of='current').data, ts1) + assert_frame_equal(library.read(symbol).data, ts2) + versions = library.list_versions(symbol) + assert versions[0]['snapshots'] == [] + assert versions[1]['snapshots'] == ['current'] + + library.snapshot('new') + assert_frame_equal(library.read(symbol, as_of='current').data, ts1) + assert_frame_equal(library.read(symbol, as_of='new').data, ts2) + assert_frame_equal(library.read(symbol).data, ts2) + versions = library.list_versions(symbol) + assert versions[0]['snapshots'] == ['new'] + assert versions[1]['snapshots'] == ['current'] + + # Replace the current version, and the snapshot shouldn't be deleted + library.write(symbol, ts1, prune_previous_version=True) + assert_frame_equal(library.read(symbol, as_of='current').data, ts1) + assert_frame_equal(library.read(symbol, as_of='new').data, ts2) + assert_frame_equal(library.read(symbol).data, ts1) + versions = library.list_versions(symbol) + assert versions[0]['snapshots'] == [] + assert versions[1]['snapshots'] == ['new'] + assert versions[2]['snapshots'] == ['current'] + + +def test_snapshot_exclusion(library): + library.write(symbol, ts1) + library.snapshot('current', skip_symbols=[symbol]) + versions = list(library.list_versions(symbol)) + assert len(versions) == 1 + assert versions[0]['snapshots'] == [] + + +def test_snapshot_delete(library): + library.write(symbol, ts1) + library.snapshot('current') + library.write(symbol, ts2) + + # We have two versions of the symbol + assert len(list(library.list_versions(symbol))) == 2 + library.delete_snapshot('current') + # Data no longer referenced by snapshot + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of='current') + # But still accessible through the version + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + + # Snapshot again + library.snapshot('current') + library.write(symbol, ts1) + assert_frame_equal(library.read(symbol, as_of='current').data, ts2) + + +def test_multiple_snapshots(library): + library.write(symbol, ts1) + library.snapshot('current') + 
library.write(symbol, ts2) + library.snapshot('current2') + + assert 'current' in library.list_snapshots() + assert 'current2' in library.list_snapshots() + + assert_frame_equal(library.read(symbol).data, ts2) + assert_frame_equal(library.read(symbol, as_of=1).data, ts1) + assert_frame_equal(library.read(symbol, as_of=2).data, ts2) + assert_frame_equal(library.read(symbol, as_of='current').data, ts1) + assert_frame_equal(library.read(symbol, as_of='current2').data, ts2) + + library.delete_snapshot('current') + assert_frame_equal(library.read(symbol, as_of='current2').data, ts2) + library.delete_snapshot('current2') + assert len(list(library.list_versions(symbol))) == 2 + + +def test_delete_identical_snapshots(library): + library.write(symbol, ts1) + library.snapshot('current1') + library.snapshot('current2') + library.snapshot('current3') + + library.delete_snapshot('current3') + assert_frame_equal(library.read(symbol, as_of='current2').data, ts1) + library.delete_snapshot('current1') + assert_frame_equal(library.read(symbol, as_of='current2').data, ts1) + assert_frame_equal(library.read(symbol).data, ts1) + + +def test_list_snapshots(library): + library.write(symbol, ts1) + library.snapshot('current') + library.snapshot('current2') + + assert 'current' in library.list_snapshots() + assert 'current2' in library.list_snapshots() + + +def test_duplicate_snapshots(library): + library.write(symbol, ts1) + library.snapshot('current') + with pytest.raises(DuplicateSnapshotException): + library.snapshot('current') + + +def test_prunes_multiple_versions(library): + coll = library._collection + + a = [{'a':'b'}] + c = [{'c':'d'}] + # Create an ObjectId + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 4 + + # Prunes all versions older than the most recent version that's older than 10 mins + library.write(symbol, a, prune_previous_version=True) + assert coll.versions.count() == 3 + assert library.read(symbol, as_of=3).data == a + assert library.read(symbol, as_of=4).data == c + assert library.read(symbol, as_of=5).data == a + + +def test_prunes_doesnt_prune_snapshots(library): + coll = library._collection + + a = [{'a':'b'}] + c = [{'c':'d'}] + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + library.snapshot('snap') + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 4 + + # Prunes all versions older than the most recent version 
that's older than 10 mins + library.write(symbol, a, prune_previous_version=True) + assert coll.versions.count() == 4 + assert library.read(symbol, as_of='snap').data == c + assert library.read(symbol, as_of=3).data == a + assert library.read(symbol, as_of=4).data == c + assert library.read(symbol, as_of=5).data == a + + # Remove the snapshot, the version should now be pruned + library.delete_snapshot('snap') + assert coll.versions.count() == 4 + library.write(symbol, c, prune_previous_version=True) + assert coll.versions.count() == 4 + assert library.read(symbol, as_of=4).data == c + assert library.read(symbol, as_of=5).data == a + assert library.read(symbol, as_of=6).data == c + + +def test_prunes_multiple_versions_ts(library): + coll = library._collection + + a = ts1 + c = ts2 + # Create an ObjectId + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 4 + + # Prunes all versions older than the most recent version that's older than 10 mins + library.write(symbol, a, prune_previous_version=True) + assert coll.versions.count() == 3 + assert_frame_equal(library.read(symbol, as_of=3).data, a) + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, a) + + +def test_prunes_doesnt_prune_snapshots_ts(library): + coll = library._collection + + a = ts1 + c = ts2 + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + library.snapshot('snap') + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 4 + + # Prunes all versions older than the most recent version that's older than 10 mins + library.write(symbol, a, prune_previous_version=True) + assert coll.versions.count() == 4 + assert_frame_equal(library.read(symbol, as_of='snap').data, c) + assert_frame_equal(library.read(symbol, as_of=3).data, a) + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, a) + + # Remove the snapshot, the version should now be pruned + library.delete_snapshot('snap') + assert coll.versions.count() == 4 + library.write(symbol, c, prune_previous_version=True) + assert coll.versions.count() == 4 + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, a) + assert_frame_equal(library.read(symbol, as_of=6).data, c) + + +def test_prunes_multiple_versions_fully_different_tss(library): + coll 
= library._collection + + a = ts1 + b = ts2 + c = b.copy() + c.index = [i + dtd(days=365) for i in c.index] + c.index.name = b.index.name + # Create an ObjectId + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=124))): + library.write(symbol, b, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + # a b and c versions above will be pruned a and b share months + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, c, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 5 + + # Prunes all versions older than the most recent version that's older than 10 mins + library.write(symbol, c, prune_previous_version=True) + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, c) + assert_frame_equal(library.read(symbol, as_of=6).data, c) + + +def test_prunes_doesnt_prune_snapshots_fully_different_tss(library): + coll = library._collection + + a = ts1 + b = ts2 + c = b.copy() + c.index = [i + dtd(days=365) for i in c.index] + c.index.name = b.index.name + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=125))): + library.write(symbol, a, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=123))): + library.write(symbol, b, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=122))): + library.write(symbol, c, prune_previous_version=False) + library.snapshot('snap') + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=121))): + library.write(symbol, c, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=118))): + library.write(symbol, c, prune_previous_version=False) + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=119))): + library.write(symbol, c, prune_previous_version=False) + assert coll.versions.count() == 6 + + # Prunes all versions older than the most recent version that's older than 10 mins + library.write(symbol, c, prune_previous_version=True) + assert coll.versions.count() == 5 + assert_frame_equal(library.read(symbol, as_of='snap').data, c) + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, c) + assert_frame_equal(library.read(symbol, as_of=6).data, c) + assert_frame_equal(library.read(symbol, as_of=7).data, c) + + library.delete_snapshot('snap') + assert coll.versions.count() == 5 + library.write(symbol, c, prune_previous_version=True) + assert_frame_equal(library.read(symbol, as_of=4).data, c) + assert_frame_equal(library.read(symbol, as_of=5).data, c) + assert_frame_equal(library.read(symbol, as_of=6).data, c) + assert_frame_equal(library.read(symbol, as_of=7).data, c) + + +def test_prunes_previous_version_append_interaction(library): + ts = ts1 + ts2 = 
ts1.append(pd.DataFrame(index=[ts.index[-1] + dtd(days=1), + ts.index[-1] + dtd(days=2), ], + data=[3.7, 3.8], + columns=['near'])) + ts2.index.name = ts1.index.name + ts3 = ts.append(pd.DataFrame(index=[ts2.index[-1] + dtd(days=1), + ts2.index[-1] + dtd(days=2)], + data=[4.8, 4.9], + columns=['near'])) + ts3.index.name = ts1.index.name + ts4 = ts + ts5 = ts2 + ts6 = ts3 + now = dt.utcnow() + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=130)), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts, prune_previous_version=False) + assert_frame_equal(ts, library.read(symbol).data) + + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=129)), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts2, prune_previous_version=False) + assert_frame_equal(ts, library.read(symbol, as_of=1).data) + assert_frame_equal(ts2, library.read(symbol).data) + + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=128)), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts3, prune_previous_version=False) + assert_frame_equal(ts, library.read(symbol, as_of=1).data) + assert_frame_equal(ts2, library.read(symbol, as_of=2).data) + assert_frame_equal(ts3, library.read(symbol).data) + + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=127)), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts4, prune_previous_version=False) + assert_frame_equal(ts, library.read(symbol, as_of=1).data) + assert_frame_equal(ts2, library.read(symbol, as_of=2).data) + assert_frame_equal(ts3, library.read(symbol, as_of=3).data) + assert_frame_equal(ts4, library.read(symbol).data) + + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now - dtd(minutes=126)), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts5, prune_previous_version=False) + assert_frame_equal(ts, library.read(symbol, as_of=1).data) + assert_frame_equal(ts2, library.read(symbol, as_of=2).data) + assert_frame_equal(ts3, library.read(symbol, as_of=3).data) + assert_frame_equal(ts4, library.read(symbol, as_of=4).data) + assert_frame_equal(ts5, library.read(symbol).data) + + with patch("bson.ObjectId", return_value=bson.ObjectId.from_datetime(now), + from_datetime=bson.ObjectId.from_datetime): + library.write(symbol, ts6, prune_previous_version=True) + + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of=1) + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of=2) + with pytest.raises(NoDataFoundException): + library.read(symbol, as_of=3) + assert_frame_equal(ts5, library.read(symbol, as_of=5).data) + assert_frame_equal(ts6, library.read(symbol).data) + + +def test_list_symbols(library): + library.snapshot('snap1') + library.write('asdf', {'foo':'bar'}, metadata={'a':1, 'b':10}) + library.snapshot('snap2') + assert 'asdf' in library.list_symbols() + assert 'asdf' not in library.list_symbols(snapshot='snap1') + assert 'asdf' in library.list_symbols(snapshot='snap2') + assert 'asdf' in library.list_symbols(all_symbols=True) + assert 'asdf' in library.list_symbols(a=1) + assert library.list_symbols(a={'$gt': 5}) == [] + assert library.list_symbols(b={'$gt': 5}) == ['asdf'] + + +def test_list_symbols_regex(library): + library.snapshot('snap1') + library.write('asdf', {'foo':'bar'}, metadata={'a':1, 'b':10}) + library.write('furble', {'foo':'bar'}, metadata={'a':1, 'b':10}) + 
library.snapshot('snap2') + assert 'asdf' in library.list_symbols(regex='asd') + assert 'furble' not in library.list_symbols(regex='asd') + assert 'asdf' not in library.list_symbols(snapshot='snap1', regex='asd') + assert 'asdf' in library.list_symbols(snapshot='snap2', regex='asd') + assert 'furble' not in library.list_symbols(snapshot='snap2', regex='asd') + assert 'asdf' in library.list_symbols(all_symbols=True, regex='asd') + assert 'furble' not in library.list_symbols(all_symbols=True, regex='asd') + assert 'asdf' in library.list_symbols(a=1, regex='asd') + assert 'furble' not in library.list_symbols(a=1, regex='asd') + assert library.list_symbols(a={'$gt': 5}, regex='asd') == [] + assert library.list_symbols(b={'$gt': 5}, regex='asd') == ['asdf'] diff --git a/tests/integration/store/test_version_store_audit.py b/tests/integration/store/test_version_store_audit.py new file mode 100644 index 000000000..897f3d269 --- /dev/null +++ b/tests/integration/store/test_version_store_audit.py @@ -0,0 +1,215 @@ +from bson import ObjectId +from datetime import datetime as dt +from mock import patch +from pandas.util.testing import assert_frame_equal +from pymongo.errors import OperationFailure +import pytest + +from arctic.store.audit import ArcticTransaction +from arctic.exceptions import ConcurrentModificationException, NoDataFoundException + +from ...util import read_str_as_pandas + + +ts1 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 2.0 + 2012-10-09 17:06:11.040 | 2.5 + 2012-11-08 17:06:11.040 | 3.0""") + +ts2 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 4.0 + 2012-10-09 17:06:11.040 | 4.5 + 2012-10-10 17:06:11.040 | 5.0 + 2012-11-08 17:06:11.040 | 3.0""") + +ts3 = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 4.0 + 2012-10-09 17:06:11.040 | 4.5 + 2012-10-10 17:06:11.040 | 5.0 + 2012-11-08 17:06:11.040 | 3.0 + 2012-11-09 17:06:11.040 | 44.0""") + +ts1_append = read_str_as_pandas(""" times | near + 2012-09-08 17:06:11.040 | 1.0 + 2012-10-08 17:06:11.040 | 2.0 + 2012-10-09 17:06:11.040 | 2.5 + 2012-11-08 17:06:11.040 | 3.0 + 2012-11-09 17:06:11.040 | 3.0""") + +symbol = 'TS1' + + +def test_ArcticTransaction_can_do_first_writes(library): + with ArcticTransaction(library, 'SYMBOL_NOT_HERE', 'user', 'log') as cwb: + cwb.write('SYMBOL_NOT_HERE', ts1) + wrote_vi = library.read('SYMBOL_NOT_HERE') + assert_frame_equal(wrote_vi.data, ts1) + + +def test_ArcticTransaction_detects_concurrent_writes(library): + library.write('FOO', ts1) + + from threading import Event, Thread + e1 = Event() + e2 = Event() + + def losing_writer(): + #will attempt to write version 2, should find that version 2 is there and it ends up writing version 3 + with pytest.raises(ConcurrentModificationException): + with ArcticTransaction(library, 'FOO', 'user', 'log') as cwb: + cwb.write('FOO', ts1_append, metadata={'foo': 'bar'}) + e1.wait() + + def winning_writer(): + #will attempt to write version 2 as well + with ArcticTransaction(library, 'FOO', 'user', 'log') as cwb: + cwb.write('FOO', ts2, metadata={'foo': 'bar'}) + e2.wait() + + t1 = Thread(target=losing_writer) + t2 = Thread(target=winning_writer) + t1.start() + t2.start() + + # both read the same timeseries and are locked doing some 'work' + e2.set() + # t2 should now be able to finish + t2.join() + e1.set() + t1.join() + + # we're expecting the losing_writer to undo its write once it realises that it wrote v3 instead of 
v2 + wrote_vi = library.read('FOO') + assert_frame_equal(wrote_vi.data, ts2) + assert {'foo': 'bar'} == wrote_vi.metadata + + +def test_audit_writes(library): + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, ts1) + + with ArcticTransaction(library, symbol, 'u2', 'l2') as mt: + mt.write(symbol, ts2) + + audit_log = library.read_audit_log(symbol) + assert audit_log == [{u'new_v': 2, u'symbol': u'TS1', u'message': u'l2', u'user': u'u2', u'orig_v': 1}, + {u'new_v': 1, u'symbol': u'TS1', u'message': u'l1', u'user': u'u1', u'orig_v': 0}] + assert_frame_equal(ts1, library.read(symbol, audit_log[0]['orig_v']).data) + assert_frame_equal(ts2, library.read(symbol, audit_log[0]['new_v']).data) + + +def test_metadata_changes_writes(library): + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, ts1, metadata={'original': 'data'}) + + with ArcticTransaction(library, symbol, 'u2', 'l2') as mt: + mt.write(symbol, ts1, metadata={'some': 'data', 'original': 'data'}) + + audit_log = library.read_audit_log(symbol) + assert audit_log == [{u'new_v': 2, u'symbol': u'TS1', u'message': u'l2', u'user': u'u2', u'orig_v': 1}, + {u'new_v': 1, u'symbol': u'TS1', u'message': u'l1', u'user': u'u1', u'orig_v': 0}] + assert_frame_equal(ts1, library.read(symbol, audit_log[0]['orig_v']).data) + assert_frame_equal(ts1, library.read(symbol, audit_log[0]['new_v']).data) + + assert library.read(symbol, audit_log[0]['orig_v']).metadata == {'original': 'data'} + assert library.read(symbol, audit_log[0]['new_v']).metadata == {'some': 'data', 'original': 'data'} + + +def test_cleanup_orphaned_versions_integration(library): + _id = ObjectId.from_datetime(dt(2013, 1, 1)) + with patch('bson.ObjectId', return_value=_id): + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, ts1) + assert library._versions.find({'parent': {'$size': 1}}).count() == 1 + library._cleanup_orphaned_versions(False) + assert library._versions.find({'parent': {'$size': 1}}).count() == 1 + + +def test_corrupted_read_writes_new(library): + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, ts1) + + res = library.read(symbol) + assert res.version == 1 + + with ArcticTransaction(library, symbol, 'u1', 'l2') as mt: + mt.write(symbol, ts2) + + res = library.read(symbol) + assert res.version == 2 + + with patch.object(library, 'read') as l: + l.side_effect = OperationFailure('some failure') + with ArcticTransaction(library, symbol, 'u1', 'l2') as mt: + mt.write(symbol, ts3, metadata={'a': 1, 'b': 2}) + + res = library.read(symbol) + # Corrupted data still increments on write to next version correctly with new data + assert res.version == 3 + assert_frame_equal(ts3, library.read(symbol, 3).data) + assert res.metadata == {'a': 1, 'b': 2} + + with patch.object(library, 'read') as l: + l.side_effect = OperationFailure('some failure') + with ArcticTransaction(library, symbol, 'u1', 'l2') as mt: + mt.write(symbol, ts3, metadata={'a': 1, 'b': 2}) + + res = library.read(symbol) + # Corrupted data still increments to next version correctly with ts & metadata unchanged + assert res.version == 4 + assert_frame_equal(ts3, library.read(symbol, 4).data) + assert res.metadata == {'a': 1, 'b': 2} + + +def test_write_after_delete(library): + with ArcticTransaction(library, symbol, 'u1', 'l') as mt: + mt.write(symbol, ts1) + library.delete(symbol) + + with ArcticTransaction(library, symbol, 'u1', 'l') as mt: + mt.write(symbol, ts1_append) + 
assert_frame_equal(library.read(symbol).data, ts1_append) + + +def test_ArcticTransaction_write_skips_for_exact_match(library): + ts = read_str_as_pandas("""times | PX_LAST + 2014-10-31 21:30:00.000 | 204324.674 + 2014-11-13 21:30:00.000 | 193964.45 + 2014-11-14 21:30:00.000 | 193650.403""") + + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, ts) + + version = library.read(symbol).version + + # try and store same TimeSeries again + with ArcticTransaction(library, symbol, 'u1', 'l2') as mt: + mt.write(symbol, ts) + + assert library.read(symbol).version == version + + +def test_ArcticTransaction_write_doesnt_skip_for_close_ts(library): + orig_ts = read_str_as_pandas("""times | PX_LAST + 2014-10-31 21:30:00.000 | 204324.674 + 2014-11-13 21:30:00.000 | 193964.45 + 2014-11-14 21:30:00.000 | 193650.403""") + + with ArcticTransaction(library, symbol, 'u1', 'l1') as mt: + mt.write(symbol, orig_ts) + + assert_frame_equal(library.read(symbol).data, orig_ts) + + # try and store slighty different TimeSeries + new_ts = read_str_as_pandas("""times | PX_LAST + 2014-10-31 21:30:00.000 | 204324.672 + 2014-11-13 21:30:00.000 | 193964.453 + 2014-11-14 21:30:00.000 | 193650.406""") + + with ArcticTransaction(library, symbol, 'u1', 'l2') as mt: + mt.write(symbol, new_ts) + + assert_frame_equal(library.read(symbol).data, new_ts) diff --git a/tests/integration/test_arctic.py b/tests/integration/test_arctic.py new file mode 100644 index 000000000..46a8fc4f7 --- /dev/null +++ b/tests/integration/test_arctic.py @@ -0,0 +1,146 @@ +from datetime import datetime as dt, timedelta as dtd +from mock import patch +from pandas import DataFrame +from pandas.util.testing import assert_frame_equal +import pytest +import time +import numpy as np + +from arctic.arctic import Arctic, VERSION_STORE +from arctic.exceptions import LibraryNotFoundException, QuotaExceededException + +from ..util import get_large_ts + + +def test_connect_to_Arctic_string(mongo_host): + arctic = Arctic(mongo_host=mongo_host) + assert arctic.list_libraries() == [] + assert arctic.mongo_host == mongo_host + + +def test_connect_to_Arctic_connection(mongodb, mongo_host): + arctic = Arctic(mongodb) + assert arctic.list_libraries() == [] + assert arctic.mongo_host == mongo_host + + +def test_simple(library): + sym = 'symbol' + data = get_large_ts(100) + + library.write(sym, data) + orig = dt.now() + time.sleep(1) # Move the timestamp on 1ms + data2 = get_large_ts(100) + library.write(sym, data2, prune_previous_version=False) + + # Get the timeseries, it should be the same + read2 = library.read(sym).data + assert_frame_equal(read2, data2) + + # Ensure we can get the previous version + read = library.read(sym, as_of=orig).data + assert_frame_equal(read, data) + + +def test_indexes(arctic): + c = arctic._conn + arctic.initialize_library("library", VERSION_STORE, segment='month') + chunk = c.arctic.library.index_information() + assert chunk == {u'_id_': {u'key': [(u'_id', 1)], u'ns': u'arctic.library', u'v': 1}, + u'symbol_1_parent_1_segment_1': {u'background': True, + u'key': [(u'symbol', 1), + (u'parent', 1), + (u'segment', 1)], + u'ns': u'arctic.library', + u'unique': True, + u'v': 1}, + u'symbol_1_sha_1': {u'background': True, + u'key': [(u'symbol', 1), (u'sha', 1)], + u'ns': u'arctic.library', + u'unique': True, + u'v': 1}, + u'symbol_hashed': {u'background': True, + u'key': [(u'symbol', u'hashed')], + u'ns': u'arctic.library', + u'v': 1}} + snapshots = c.arctic.library.snapshots.index_information() + assert snapshots == 
{u'_id_': {u'key': [(u'_id', 1)], + u'ns': u'arctic.library.snapshots', + u'v': 1}, + u'name_1': {u'background': True, + u'key': [(u'name', 1)], + u'ns': u'arctic.library.snapshots', + u'unique': True, + u'v': 1}} + versions = c.arctic.library.versions.index_information() + assert versions == {u'_id_': {u'key': [(u'_id', 1)], + u'ns': u'arctic.library.versions', + u'v': 1}, + u'symbol_1__id_-1': {u'background': True, + u'key': [(u'symbol', 1), (u'_id', -1)], + u'ns': u'arctic.library.versions', + u'v': 1}, + u'symbol_1_version_-1': {u'background': True, + u'key': [(u'symbol', 1), (u'version', -1)], + u'ns': u'arctic.library.versions', + u'unique': True, + u'v': 1}} + version_nums = c.arctic.library.version_nums.index_information() + assert version_nums == {u'_id_': {u'key': [(u'_id', 1)], + u'ns': u'arctic.library.version_nums', + u'v': 1}, + u'symbol_1': {u'background': True, + u'key': [(u'symbol', 1)], + u'ns': u'arctic.library.version_nums', + u'unique': True, + u'v': 1}} + + +def test_delete_library(arctic, library, library_name): + mongo = arctic._conn + # create a library2 library too - ensure that this isn't deleted + arctic.initialize_library('user.library2', VERSION_STORE, segment='month') + library.write('asdf', get_large_ts(1)) + assert 'TEST' in mongo.arctic_test.collection_names() + assert 'TEST.versions' in mongo.arctic_test.collection_names() + assert 'library2' in mongo.arctic_user.collection_names() + assert 'library2.versions' in mongo.arctic_user.collection_names() + + arctic.delete_library(library_name) + assert 'TEST' not in mongo.arctic_user.collection_names() + assert 'TEST.versions' not in mongo.arctic_user.collection_names() + with pytest.raises(LibraryNotFoundException): + arctic[library_name] + with pytest.raises(LibraryNotFoundException): + arctic['arctic_{}'.format(library_name)] + assert 'library2' in mongo.arctic_user.collection_names() + assert 'library2.versions' in mongo.arctic_user.collection_names() + + +def test_quota(arctic, library, library_name): + thing = list(range(100)) + library._arctic_lib.set_quota(10) + assert arctic.get_quota(library_name) == 10 + assert library._arctic_lib.get_quota() == 10 + library.write('thing', thing) + with pytest.raises(QuotaExceededException): + library.write('ts', thing) + library.write('ts', thing) + library.write('ts', thing) + library.write('ts', thing) + with pytest.raises(QuotaExceededException): + arctic.check_quota(library_name) + + +def test_check_quota(arctic, library, library_name): + with patch('arctic.logging.logger.info') as info: + arctic.check_quota(library_name) + assert info.call_count == 1 + + +def test_default_mongo_retry_timout(): + now = time.time() + with pytest.raises(LibraryNotFoundException): + Arctic('unresolved-host', serverSelectionTimeoutMS=0)['some.lib'] + assert time.time() - now < 1. 
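+
+
+# Illustrative sketch only -- not collected by pytest (no 'test_' prefix) and the
+# library name 'user.example' is a placeholder. It summarises the basic Arctic
+# workflow exercised above: connect, initialise a VersionStore library, write a
+# couple of versions and read an old one back via as_of.
+def _example_arctic_workflow(mongo_host):
+    arctic = Arctic(mongo_host=mongo_host)
+    arctic.initialize_library('user.example', VERSION_STORE, segment='month')
+    lib = arctic['user.example']
+    lib.write('symbol', get_large_ts(10))                                # version 1
+    lib.write('symbol', get_large_ts(10), prune_previous_version=False)  # version 2
+    assert lib.read('symbol').version == 2
+    assert lib.read('symbol', as_of=1).version == 1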
diff --git a/tests/integration/test_compress_integration.py b/tests/integration/test_compress_integration.py new file mode 100644 index 000000000..f084506ba --- /dev/null +++ b/tests/integration/test_compress_integration.py @@ -0,0 +1,35 @@ +import random +import lz4 +import string +import pytest +from datetime import datetime as dt + +import arctic._compress as c + + +@pytest.mark.parametrize("n, length", [(300, 5e4), # micro TS + (5, 2e6), # Futures TS + (10, 2e6), # Futures TS + (100, 2e6), # Large TS + (250, 2e6)]) # Even Bigger TS +def test_performance_sequential(n, length): + _str = random_string(length) + _strarr = [_str for _ in range(n)] + now = dt.now() + [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]] + clz4_time = (dt.now() - now).total_seconds() + now = dt.now() + c.decompressarr(c.compressarrHC(_strarr)) + clz4_time_p = (dt.now() - now).total_seconds() + now = dt.now() + [lz4.decompress(y) for y in [lz4.compressHC(x) for x in _strarr]] + lz4_time = (dt.now() - now).total_seconds() + print + print "LZ4 Test %sx len:%s" % (n, length) + print " Cython LZ4 %s s" % clz4_time + print " Cython LZ4 Parallel %s s" % clz4_time_p + print " LZ4 %s s" % lz4_time + + +def random_string(N): + return ''.join(random.choice(list(string.printable) + ['hello', 'world', 'hellworld', 'Hello', 'w0rld']) for _ in xrange(int(N))) diff --git a/tests/integration/test_decorators.py b/tests/integration/test_decorators.py new file mode 100644 index 000000000..fd3e3f487 --- /dev/null +++ b/tests/integration/test_decorators.py @@ -0,0 +1,7 @@ +from arctic.decorators import _get_host + + +def test_get_host_VersionStore(library, mongo_host): + assert _get_host(library) == {'mnodes': [mongo_host], + 'mhost': mongo_host, + 'l': u'arctic_test.TEST'} diff --git a/tests/integration/test_howtos.py b/tests/integration/test_howtos.py new file mode 100644 index 000000000..59477c464 --- /dev/null +++ b/tests/integration/test_howtos.py @@ -0,0 +1,13 @@ +import glob +import fcntl +import os +import pytest +import subprocess + +HOWTO_DIR = os.path.realpath(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'howtos')) + + +@pytest.mark.parametrize('howto', sorted([x.split('/')[-1] + for x in glob.glob(os.path.join(HOWTO_DIR, 'how_to_*.py'))])) +def test_howto(howto, mongo_host): + execfile(HOWTO_DIR + "/" + howto, {'mongo_host': mongo_host}) diff --git a/tests/integration/tickstore/__init__.py b/tests/integration/tickstore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/tickstore/conftest.py b/tests/integration/tickstore/conftest.py new file mode 100644 index 000000000..cfc4a8edc --- /dev/null +++ b/tests/integration/tickstore/conftest.py @@ -0,0 +1,23 @@ +import pytest + +from arctic.tickstore import toplevel +from arctic.tickstore import tickstore + + +def pytest_generate_tests(metafunc): + if 'tickstore_lib' in metafunc.fixturenames: + metafunc.parametrize("tickstore_lib", ['tickstore'], indirect=True) + + +@pytest.fixture(scope='function') +def tickstore_lib(arctic, request): + if request.param == "tickstore": + store = tickstore + arctic.initialize_library('test.tickstore', store.TICK_STORE_TYPE) + return arctic['test.tickstore'] + + +@pytest.fixture(scope='function') +def toplevel_tickstore(arctic): + arctic.initialize_library('test.toplevel_tickstore', toplevel.TICK_STORE_TYPE) + return arctic['test.toplevel_tickstore'] diff --git a/tests/integration/tickstore/test_toplevel.py b/tests/integration/tickstore/test_toplevel.py new file mode 
100644 index 000000000..b66703dea --- /dev/null +++ b/tests/integration/tickstore/test_toplevel.py @@ -0,0 +1,170 @@ +from datetime import datetime as dt, timedelta as dtd +from dateutil.rrule import rrule, DAILY +import pytest +import pandas as pd +from pandas.util.testing import assert_frame_equal +import numpy as np + +from arctic.date import DateRange, mktz +from arctic.tickstore import toplevel +from arctic.tickstore import tickstore +from arctic.exceptions import NoDataFoundException, LibraryNotFoundException, OverlappingDataException + + +FEED_2010_LEVEL1 = toplevel.TickStoreLibrary('FEED_2010.LEVEL1', DateRange(dt(2010, 1, 1), dt(2010, 12, 31, 23, 59, 59))) +FEED_2011_LEVEL1 = toplevel.TickStoreLibrary('FEED_2011.LEVEL1', DateRange(dt(2011, 1, 1), dt(2011, 12, 31, 23, 59, 59))) +FEED_2012_LEVEL1 = toplevel.TickStoreLibrary('FEED_2012.LEVEL1', DateRange(dt(2012, 1, 1), dt(2012, 12, 31, 23, 59, 59))) + +@pytest.mark.parametrize(('start', 'end', 'expected'), + [(dt(2010, 2, 1), dt(2010, 4, 1), [FEED_2010_LEVEL1]), + (dt(2011, 2, 1), dt(2011, 4, 1), [FEED_2011_LEVEL1]), + (dt(2010, 2, 1), dt(2011, 4, 1), [FEED_2010_LEVEL1, FEED_2011_LEVEL1]), + (dt(2011, 2, 1), dt(2012, 4, 1), [FEED_2011_LEVEL1, FEED_2012_LEVEL1]), + (dt(2010, 2, 1), dt(2012, 4, 1), [FEED_2010_LEVEL1, FEED_2011_LEVEL1, FEED_2012_LEVEL1]), + (dt(2009, 2, 1), dt(2010, 12, 31), [FEED_2010_LEVEL1]), + (dt(2012, 2, 1), dt(2013, 12, 31), [FEED_2012_LEVEL1]), + (dt(2009, 2, 1), dt(2009, 12, 31), []), + (dt(2013, 2, 1), dt(2013, 12, 31), []), + ]) +def should_return_libraries_for_the_given_daterange(toplevel_tickstore, start, end, expected): + toplevel_tickstore._collection.insert_one({'start': dt(2010, 1, 1), + 'end': dt(2010, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2010.LEVEL1'}) + toplevel_tickstore._collection.insert_one({'start': dt(2011, 1, 1), + 'end': dt(2011, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2011.LEVEL1'}) + toplevel_tickstore._collection.insert_one({'start': dt(2012, 1, 1), + 'end': dt(2012, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2012.LEVEL1'}) + libraries = toplevel_tickstore._get_library_metadata(DateRange(start=start, end=end)) + assert libraries == expected + + +def should_raise_exceptions_if_no_libraries_are_found_in_the_date_range_when_reading_data(toplevel_tickstore): + toplevel_tickstore._collection.insert_one({'start': dt(2010, 1, 1), + 'end': dt(2010, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2010.LEVEL1'}) + with pytest.raises(NoDataFoundException) as e: + toplevel_tickstore.read('blah', DateRange(start=dt(2012, 1, 1), end=dt(2012, 3, 1))) + assert "No underlying libraries exist for the given date range" in str(e) + + +def should_return_data_when_date_range_falls_in_a_single_underlying_library(toplevel_tickstore, arctic): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + tickstore = arctic['FEED_2010.LEVEL1'] + arctic.initialize_library('test_current.toplevel_tickstore', tickstore.TICK_STORE_TYPE) + tickstore_current = arctic['test_current.toplevel_tickstore'] + toplevel_tickstore._collection.insert_one({'start': dt(2010, 1, 1), + 'end': dt(2010, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2010.LEVEL1'}) + dates = pd.date_range('20100101', periods=6, tz=mktz('Europe/London')) + df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) + tickstore.write('blah', df) + tickstore_current.write('blah', df) + res = toplevel_tickstore.read('blah', DateRange(start=dt(2010, 1, 1), end=dt(2010, 1, 6)), list('ABCD')) + + assert_frame_equal(df, 
res.tz_localize(mktz('Europe/London'))) + + +def should_return_data_when_date_range_spans_libraries(toplevel_tickstore, arctic): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + arctic.initialize_library('FEED_2011.LEVEL1', tickstore.TICK_STORE_TYPE) + tickstore_2010 = arctic['FEED_2010.LEVEL1'] + tickstore_2011 = arctic['FEED_2011.LEVEL1'] + toplevel_tickstore._collection.insert_one({'start': dt(2010, 1, 1), + 'end': dt(2010, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2010.LEVEL1'}) + toplevel_tickstore._collection.insert_one({'start': dt(2011, 1, 1), + 'end': dt(2011, 12, 31, 23, 59, 59), + 'library_name': 'FEED_2011.LEVEL1'}) + dates = pd.date_range('20100101', periods=6, tz=mktz('Europe/London')) + df_10 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) + tickstore_2010.write('blah', df_10) + dates = pd.date_range('20110101', periods=6, tz=mktz('Europe/London')) + df_11 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) + tickstore_2011.write('blah', df_11) + res = toplevel_tickstore.read('blah', DateRange(start=dt(2010, 1, 2), end=dt(2011, 1, 4)), list('ABCD')) + expected_df = pd.concat([df_10[1:], df_11[:4]]) + assert_frame_equal(expected_df, res.tz_localize(mktz('Europe/London'))) + + +def should_add_underlying_library_where_none_exists(toplevel_tickstore, arctic): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + toplevel_tickstore.add(DateRange(start=dt(2010, 1, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010.LEVEL1') + assert toplevel_tickstore._collection.find_one({'library_name': 'FEED_2010.LEVEL1'}) + + +def should_add_underlying_library_where_another_library_exists_in_a_non_overlapping_daterange(toplevel_tickstore, arctic): + toplevel_tickstore._collection.insert_one({'library_name': 'FEED_2011.LEVEL1', 'start': dt(2011, 1, 1), 'end': dt(2011, 12, 31)}) + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + toplevel_tickstore.add(DateRange(start=dt(2010, 1, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010.LEVEL1') + assert set([ res['library_name'] for res in toplevel_tickstore._collection.find()]) == set(['FEED_2010.LEVEL1', 'FEED_2011.LEVEL1']) + + +def should_raise_exception_if_library_does_not_exist(toplevel_tickstore): + with pytest.raises(LibraryNotFoundException) as e: + toplevel_tickstore.add(DateRange(start=dt(2010, 1, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010.LEVEL1') + assert toplevel_tickstore._collection.find_one({'library_name': 'FEED_2010.LEVEL1'}) + assert "Library FEED_2010.LEVEL1 was not correctly initialized" in str(e) + + +def should_raise_exception_if_date_range_for_library_overlaps_with_existing_libraries(toplevel_tickstore, arctic): + toplevel_tickstore._collection.insert_one({'library_name': 'FEED_2010.LEVEL1', 'start': dt(2010, 1, 1), 'end': dt(2010, 6, 30)}) + arctic.initialize_library('FEED_2010a.LEVEL1', tickstore.TICK_STORE_TYPE) + with pytest.raises(OverlappingDataException) as e: + toplevel_tickstore.add(DateRange(start=dt(2010, 6, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010a.LEVEL1') + assert toplevel_tickstore._collection.find_one({'library_name': 'FEED_2010.LEVEL1'}) + assert "There are libraries that overlap with the date range:" in str(e) + + +def should_successfully_do_a_roundtrip_write_and_read_spanning_multiple_underlying_libraries(toplevel_tickstore, arctic): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + 
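+    # How the toplevel store is expected to route the single write further down
+    # (a hedged summary of the assertions at the end of this test, not extra test
+    # logic): the 57-day frame spans the 2010/2011 boundary, so rows dated before
+    # 2011-01-01 should land in FEED_2010.LEVEL1 and rows from 2011-01-01 onwards
+    # in FEED_2011.LEVEL1, while toplevel_tickstore.read() stitches the two
+    # ranges back together into one frame.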
arctic.initialize_library('FEED_2011.LEVEL1', tickstore.TICK_STORE_TYPE) + arctic.initialize_library('test_current.toplevel_tickstore', tickstore.TICK_STORE_TYPE) + toplevel_tickstore.add(DateRange(start=dt(2010, 1, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010.LEVEL1') + toplevel_tickstore.add(DateRange(start=dt(2011, 1, 1), end=dt(2011, 12, 31, 23, 59, 59, 999000)), 'FEED_2011.LEVEL1') + tickstore_current = arctic['test_current.toplevel_tickstore'] + dates = pd.date_range('20101201', periods=57, tz=mktz('Europe/London')) + data = pd.DataFrame(np.random.randn(57, 4), index=dates, columns=list('ABCD')) + toplevel_tickstore.write('blah', data) + tickstore_current.write('blah', data) + res = toplevel_tickstore.read('blah', DateRange(start=dt(2010, 12, 1), end=dt(2011, 2, 1)), columns=list('ABCD')) + assert_frame_equal(data, res.tz_localize(mktz('Europe/London'))) + lib2010 = arctic['FEED_2010.LEVEL1'] + res = lib2010.read('blah', DateRange(start=dt(2010, 12, 1), end=dt(2011, 1, 1)), columns=list('ABCD')) + assert_frame_equal(data[dt(2010, 12, 1): dt(2010, 12, 31)], res.tz_localize(mktz('Europe/London'))) + lib2011 = arctic['FEED_2011.LEVEL1'] + res = lib2011.read('blah', DateRange(start=dt(2011, 1, 1), end=dt(2011, 2, 1)), columns=list('ABCD')) + assert_frame_equal(data[dt(2011, 1, 1): dt(2011, 2, 1)], res.tz_localize(mktz('Europe/London'))) + + +@pytest.mark.parametrize(('start', 'end', 'startr', 'endr'), + [(dt(2010, 1, 1), dt(2011, 12, 31), 0, 10), + (dt(2010, 1, 1), dt(2010, 12, 31), 0, 8), + (dt(2011, 1, 1), dt(2011, 12, 31), 7, 10), + ]) +def should_list_symbols_from_the_underlying_library(toplevel_tickstore, arctic, start, end, startr, endr): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + arctic.initialize_library('FEED_2011.LEVEL1', tickstore.TICK_STORE_TYPE) + toplevel_tickstore.add(DateRange(start=dt(2010, 1, 1), end=dt(2010, 12, 31, 23, 59, 59, 999000)), 'FEED_2010.LEVEL1') + toplevel_tickstore.add(DateRange(start=dt(2011, 1, 1), end=dt(2011, 12, 31, 23, 59, 59, 999000)), 'FEED_2011.LEVEL1') + dtstart = dt(2010, 1, 1, tzinfo=mktz('Europe/London')) + for i in range(10): + dates = pd.date_range(dtstart, periods=50, tz=mktz('Europe/London')) + df = pd.DataFrame(np.random.randn(50, 4), index=dates, columns=list('ABCD')) + dtstart = dates[-1] + dtd(days=1) + toplevel_tickstore.write('sym' + str(i), df) + expected_symbols = ['sym' + str(i) for i in range(startr, endr)] + assert expected_symbols == toplevel_tickstore.list_symbols(DateRange(start=start, end=end)) + + +def should_add_underlying_libraries_when_intialized(arctic): + arctic.initialize_library('FEED_2010.LEVEL1', tickstore.TICK_STORE_TYPE) + arctic.initialize_library('FEED_2011.LEVEL1', tickstore.TICK_STORE_TYPE) + arctic.initialize_library('FEED.LEVEL1', toplevel.TICK_STORE_TYPE) + toplevel_tickstore = arctic['FEED.LEVEL1'] + cur = toplevel_tickstore._collection.find(projection={'_id': 0}) + results = {result['library_name']: {'start': result['start'], 'end': result['end']} for result in cur} + expected_results = {'FEED_2010.LEVEL1': {'start': dt(2010, 1, 1), 'end': dt(2010, 12, 31, 23, 59, 59, 999000)}, + 'FEED_2011.LEVEL1': {'start': dt(2011, 1, 1), 'end': dt(2011, 12, 31, 23, 59, 59, 999000)}} + assert expected_results == results diff --git a/tests/integration/tickstore/test_ts_delete.py b/tests/integration/tickstore/test_ts_delete.py new file mode 100644 index 000000000..dd8ecc5f9 --- /dev/null +++ b/tests/integration/tickstore/test_ts_delete.py @@ -0,0 +1,54 @@ +from datetime 
import datetime as dt +from mock import patch +import numpy as np +from pandas.util.testing import assert_frame_equal +import pytest + +from arctic import arctic as m +from arctic.date import DateRange, CLOSED_OPEN, mktz +from arctic.exceptions import OverlappingDataException, \ + NoDataFoundException + + +def test_delete(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 1, 30, tzinfo=mktz('Europe/London')) + }, + ] + tickstore_lib.chunk_size = 1 + tickstore_lib.write('SYM', DUMMY_DATA) + tickstore_lib.delete('SYM') + with pytest.raises(NoDataFoundException): + tickstore_lib.read('SYM', date_range=DateRange(20130102), columns=None) + + # Delete with a date-range + tickstore_lib.write('SYM', DUMMY_DATA) + tickstore_lib.delete('SYM', DateRange(dt(2013, 1, 1, tzinfo=mktz('Europe/London')), dt(2013, 1, 2, tzinfo=mktz('Europe/London')))) + df = tickstore_lib.read('SYM', columns=None) + assert np.allclose(df['b'].values, np.array([4.])) + + +def test_delete_daterange(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 2, 1, tzinfo=mktz('Europe/London')) + }, + ] + tickstore_lib.chunk_size = 1 + tickstore_lib.write('SYM', DUMMY_DATA) + + # Delete with a date-range + tickstore_lib.delete('SYM', DateRange(dt(2013, 1, 1, tzinfo=mktz('Europe/London')), dt(2013, 2, 1, tzinfo=mktz('Europe/London')), CLOSED_OPEN)) + df = tickstore_lib.read('SYM', columns=None) + assert np.allclose(df['b'].values, np.array([4.])) diff --git a/tests/integration/tickstore/test_ts_read.py b/tests/integration/tickstore/test_ts_read.py new file mode 100644 index 000000000..e16dc30f8 --- /dev/null +++ b/tests/integration/tickstore/test_ts_read.py @@ -0,0 +1,391 @@ +from datetime import datetime as dt +from mock import patch +import numpy as np +from numpy.testing.utils import assert_array_equal +from pandas.util.testing import assert_frame_equal +import pandas as pd +import pytest +import pytz + +from arctic import arctic as m +from arctic.date import DateRange, mktz, CLOSED_CLOSED, CLOSED_OPEN, OPEN_CLOSED, OPEN_OPEN +from arctic.exceptions import OverlappingDataException, NoDataFoundException + + +def test_read(tickstore_lib): + data = [{'ASK': 1545.25, + 'ASKSIZE': 1002.0, + 'BID': 1545.0, + 'BIDSIZE': 55.0, + 'CUMVOL': 2187387.0, + 'DELETED_TIME': 0, + 'INSTRTYPE': 'FUT', + 'PRICE': 1545.0, + 'SIZE': 1.0, + 'TICK_STATUS': 0, + 'TRADEHIGH': 1561.75, + 'TRADELOW': 1537.25, + 'index': 1185076787070}, + {'CUMVOL': 354.0, + 'DELETED_TIME': 0, + 'PRICE': 1543.75, + 'SIZE': 354.0, + 'TRADEHIGH': 1543.75, + 'TRADELOW': 1543.75, + 'index': 1185141600600}] + tickstore_lib.write('FEED::SYMBOL', data) + + df = tickstore_lib.read('FEED::SYMBOL', columns=['BID', 'ASK', 'PRICE']) + + assert_array_equal(df['ASK'].values, np.array([1545.25, np.nan])) + assert_array_equal(df['BID'].values, np.array([1545, np.nan])) + assert_array_equal(df['PRICE'].values, np.array([1545, 1543.75])) + assert_array_equal(df.index.values, np.array(['2007-07-22T04:59:47.070000000+0100', + '2007-07-22T23:00:00.600000000+0100'], dtype='datetime64[ns]')) + assert tickstore_lib._collection.find_one()['c'] == 2 + + +def test_read_symbol_as_column(tickstore_lib): + data = [{'ASK': 1545.25, + 'index': 1185076787070}, + {'CUMVOL': 354.0, + 'index': 1185141600600}] + tickstore_lib.write('FEED::SYMBOL', data) + + df = tickstore_lib.read('FEED::SYMBOL', 
columns=['SYMBOL']) + assert all(df['SYMBOL'].values == ['FEED::SYMBOL']) + + +def test_read_multiple_symbols(tickstore_lib): + data1 = [{'ASK': 1545.25, + 'ASKSIZE': 1002.0, + 'BID': 1545.0, + 'BIDSIZE': 55.0, + 'CUMVOL': 2187387.0, + 'DELETED_TIME': 0, + 'INSTRTYPE': 'FUT', + 'PRICE': 1545.0, + 'SIZE': 1.0, + 'TICK_STATUS': 0, + 'TRADEHIGH': 1561.75, + 'TRADELOW': 1537.25, + 'index': 1185076787070}, ] + data2 = [{'CUMVOL': 354.0, + 'DELETED_TIME': 0, + 'PRICE': 1543.75, + 'SIZE': 354.0, + 'TRADEHIGH': 1543.75, + 'TRADELOW': 1543.75, + 'index': 1185141600600}] + + tickstore_lib.write('BAR', data2) + tickstore_lib.write('FOO', data1) + + df = tickstore_lib.read(['FOO', 'BAR'], columns=['BID', 'ASK', 'PRICE']) + + assert all(df['SYMBOL'].values == ['FOO', 'BAR']) + assert_array_equal(df['ASK'].values, np.array([1545.25, np.nan])) + assert_array_equal(df['BID'].values, np.array([1545, np.nan])) + assert_array_equal(df['PRICE'].values, np.array([1545, 1543.75])) + assert_array_equal(df.index.values, np.array(['2007-07-22T04:59:47.070000000+0100', + '2007-07-22T23:00:00.600000000+0100'], dtype='datetime64[ns]')) + assert tickstore_lib._collection.find_one()['c'] == 1 + + + +@pytest.mark.parametrize('chunk_size', [1, 100]) +def test_read_all_cols_all_dtypes(tickstore_lib, chunk_size): + data = [{'f': 0.1, + 'of': 0.2, + 's': 's', + 'os': 'os', + 'l': 1, + 'ol': 2, + 'index': dt(1970, 1, 1, tzinfo=mktz('UTC')), + }, + {'f': 0.3, + 'nf': 0.4, + 's': 't', + 'ns': 'ns', + 'l': 3, + 'nl': 4, + 'index': dt(1970, 1, 1, 0, 0, 1, tzinfo=mktz('UTC')), + }, + ] + tickstore_lib.chunk_size = chunk_size + tickstore_lib.write('sym', data) + df = tickstore_lib.read('sym', columns=None) + + # The below is probably more trouble than it's worth, but we *should* + # be able to roundtrip data and get the same answer...
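+    # (Why the reshaping below is needed, assuming TickStore's column-oriented
+    #  storage: fields missing from a given tick come back as NaN/None, which
+    #  promotes integer columns to float and leaves missing strings as None, and
+    #  this comparison also strips the original timezone from the index.)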
+ + # Ints become floats + data[0]['l'] = float(data[0]['l']) + # Treat missing strings as None + data[0]['ns'] = None + data[1]['os'] = None + # Strip TZ from the data for the moment + data[0]['index'] = dt(1970, 1, 1) + data[1]['index'] = dt(1970, 1, 1, 0, 0, 1) + expected = pd.DataFrame(data) + expected = expected.set_index('index') + expected = expected[df.columns] + assert_frame_equal(expected, df, check_names=False) + + +DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'b': 3., + 'c': 4., + 'index': dt(2013, 1, 2, tzinfo=mktz('Europe/London')) + }, + {'b': 5., + 'c': 6., + 'index': dt(2013, 1, 3, tzinfo=mktz('Europe/London')) + }, + {'b': 7., + 'c': 8., + 'index': dt(2013, 1, 4, tzinfo=mktz('Europe/London')) + }, + {'b': 9., + 'c': 10., + 'index': dt(2013, 1, 5, tzinfo=mktz('Europe/London')) + }, + ] + + +def test_date_range(tickstore_lib): + tickstore_lib.write('SYM', DUMMY_DATA) + df = tickstore_lib.read('SYM', date_range=DateRange(20130101, 20130103), columns=None) + assert_array_equal(df['a'].values, np.array([1, np.nan, np.nan])) + assert_array_equal(df['b'].values, np.array([2., 3., 5.])) + assert_array_equal(df['c'].values, np.array([np.nan, 4., 6.])) + + tickstore_lib.delete('SYM') + + # Chunk every 3 symbols and lets have some fun + tickstore_lib.chunk_size = 3 + tickstore_lib.write('SYM', DUMMY_DATA) + + with patch.object(tickstore_lib._collection, 'find', side_effect=tickstore_lib._collection.find) as f: + df = tickstore_lib.read('SYM', date_range=DateRange(20130101, 20130103), columns=None) + assert_array_equal(df['b'].values, np.array([2., 3., 5.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1 + df = tickstore_lib.read('SYM', date_range=DateRange(20130102, 20130103), columns=None) + assert_array_equal(df['b'].values, np.array([3., 5.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1 + df = tickstore_lib.read('SYM', date_range=DateRange(20130103, 20130103), columns=None) + assert_array_equal(df['b'].values, np.array([5.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1 + + df = tickstore_lib.read('SYM', date_range=DateRange(20130102, 20130104), columns=None) + assert_array_equal(df['b'].values, np.array([3., 5., 7.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 2 + df = tickstore_lib.read('SYM', date_range=DateRange(20130102, 20130105), columns=None) + assert_array_equal(df['b'].values, np.array([3., 5., 7., 9.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 2 + + df = tickstore_lib.read('SYM', date_range=DateRange(20130103, 20130104), columns=None) + assert_array_equal(df['b'].values, np.array([5., 7.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 2 + df = tickstore_lib.read('SYM', date_range=DateRange(20130103, 20130105), columns=None) + assert_array_equal(df['b'].values, np.array([5., 7., 9.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 2 + + df = tickstore_lib.read('SYM', date_range=DateRange(20130104, 20130105), columns=None) + assert_array_equal(df['b'].values, np.array([7., 9.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1 + + # Test the different open-closed behaviours + df = tickstore_lib.read('SYM', date_range=DateRange(20130104, 20130105, CLOSED_CLOSED), columns=None) + assert_array_equal(df['b'].values, np.array([7., 9.])) + df = 
tickstore_lib.read('SYM', date_range=DateRange(20130104, 20130105, CLOSED_OPEN), columns=None) + assert_array_equal(df['b'].values, np.array([7.])) + df = tickstore_lib.read('SYM', date_range=DateRange(20130104, 20130105, OPEN_CLOSED), columns=None) + assert_array_equal(df['b'].values, np.array([9.])) + df = tickstore_lib.read('SYM', date_range=DateRange(20130104, 20130105, OPEN_OPEN), columns=None) + assert_array_equal(df['b'].values, np.array([])) + + +def test_date_range_end_not_in_range(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'b': 3., + 'c': 4., + 'index': dt(2013, 1, 2, 10, 1, tzinfo=mktz('Europe/London')) + }, + ] + + tickstore_lib.chunk_size = 1 + tickstore_lib.write('SYM', DUMMY_DATA) + with patch.object(tickstore_lib._collection, 'find', side_effect=tickstore_lib._collection.find) as f: + df = tickstore_lib.read('SYM', date_range=DateRange(20130101, dt(2013, 1, 2, 9, 0)), columns=None) + assert_array_equal(df['b'].values, np.array([2.])) + assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1 + + +def test_date_range_no_bounds(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 1, 30, tzinfo=mktz('Europe/London')) + }, + {'b': 5., + 'c': 6., + 'index': dt(2013, 2, 2, 10, 1, tzinfo=mktz('Europe/London')) + }, + ] + + tickstore_lib.chunk_size = 1 + tickstore_lib.write('SYM', DUMMY_DATA) + + # 1) No start, no end + df = tickstore_lib.read('SYM', columns=None) + assert_array_equal(df['b'].values, np.array([2., 4.])) + # 1.2) Start before the real start + df = tickstore_lib.read('SYM', date_range=DateRange(20121231), columns=None) + assert_array_equal(df['b'].values, np.array([2., 4.])) + # 2.1) Only go one month out + df = tickstore_lib.read('SYM', date_range=DateRange(20130101), columns=None) + assert_array_equal(df['b'].values, np.array([2., 4.])) + # 2.2) Only go one month out + df = tickstore_lib.read('SYM', date_range=DateRange(20130102), columns=None) + assert_array_equal(df['b'].values, np.array([4.])) + # 3) No start + df = tickstore_lib.read('SYM', date_range=DateRange(end=20130102), columns=None) + assert_array_equal(df['b'].values, np.array([2.])) + # 4) Outside bounds + df = tickstore_lib.read('SYM', date_range=DateRange(end=20131212), columns=None) + assert_array_equal(df['b'].values, np.array([2., 4., 5.])) + + +def test_date_range_BST(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 6, 1, 12, 00, tzinfo=mktz('Europe/London')) + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('Europe/London')) + }, + ] + tickstore_lib.chunk_size = 1 + tickstore_lib.write('SYM', DUMMY_DATA) + + df = tickstore_lib.read('SYM', columns=None) + assert_array_equal(df['b'].values, np.array([2., 4.])) + +# df = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, 12), +# dt(2013, 6, 1, 13))) +# assert_array_equal(df['b'].values, np.array([2., 4.])) + df = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, 12, tzinfo=mktz('Europe/London')), + dt(2013, 6, 1, 13, tzinfo=mktz('Europe/London')))) + assert_array_equal(df['b'].values, np.array([2., 4.])) + + df = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, 12, tzinfo=mktz('UTC')), + dt(2013, 6, 1, 13, tzinfo=mktz('UTC')))) + assert_array_equal(df['b'].values, np.array([4., ])) + + +def 
test_read_no_data(tickstore_lib): + with pytest.raises(NoDataFoundException): + tickstore_lib.read('missing_sym', DateRange(20131212, 20131212)) + + +def test_write_no_tz(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 6, 1, 12, 00) + }] + with pytest.raises(ValueError): + tickstore_lib.write('SYM', DUMMY_DATA) + + +def test_read_out_of_order(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 6, 1, 12, 00, tzinfo=mktz('UTC')) + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 6, 1, 11, 00, tzinfo=mktz('UTC')) # Out-of-order + }, + {'a': 3., + 'b': 4., + 'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('UTC')) + }, + ] + tickstore_lib.chunk_size = 3 + tickstore_lib.write('SYM', DUMMY_DATA) + tickstore_lib.read('SYM', columns=None) + assert len(tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, tzinfo=mktz('UTC')), dt(2013, 6, 2, tzinfo=mktz('UTC'))))) == 3 + assert len(tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, tzinfo=mktz('UTC')), dt(2013, 6, 1, 12, tzinfo=mktz('UTC'))))) == 2 + + +def test_read_longs(tickstore_lib): + DUMMY_DATA = [ + {'a': 1, + 'index': dt(2013, 6, 1, 12, 00, tzinfo=mktz('Europe/London')) + }, + { + 'b': 4, + 'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('Europe/London')) + }, + ] + tickstore_lib.chunk_size = 3 + tickstore_lib.write('SYM', DUMMY_DATA) + tickstore_lib.read('SYM', columns=None) + read = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1), dt(2013, 6, 2))) + assert read['a'][0] == 1 + assert np.isnan(read['b'][0]) + + +def test_read_with_image(tickstore_lib): + DUMMY_DATA = [ + {'a': 1., + 'index': dt(2013, 6, 1, 12, 00, tzinfo=mktz('Europe/London')) + }, + { + 'b': 4., + 'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('Europe/London')) + }, + ] + # Add an image + tickstore_lib.write('SYM', DUMMY_DATA) + tickstore_lib._collection.update_one({}, + {'$set': + {'im': {'i': + {'a': 37., + 'c': 2., + }, + 't': dt(2013, 6, 1, 11, 0) + } + } + } + ) + + tickstore_lib.read('SYM', columns=None) + read = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1), dt(2013, 6, 2))) + assert read['a'][0] == 1 + + # Read with the image as well + read = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1), dt(2013, 6, 2)), + include_images=True) + assert read['a'][0] == 37 + assert read['a'][1] == 1 + assert np.isnan(read['b'][0]) + assert read['b'][2] == 4 + assert read.index[0] == dt(2013, 6, 1, 11) diff --git a/tests/integration/tickstore/test_ts_write.py b/tests/integration/tickstore/test_ts_write.py new file mode 100644 index 000000000..5484f2d42 --- /dev/null +++ b/tests/integration/tickstore/test_ts_write.py @@ -0,0 +1,77 @@ +from datetime import datetime as dt +from mock import patch +import numpy as np +from pandas.util.testing import assert_frame_equal +import pytest + +from arctic import arctic as m +from arctic.date import mktz +from arctic.exceptions import OverlappingDataException, \ + NoDataFoundException + + +DUMMY_DATA = [ + {'a': 1., + 'b': 2., + 'index': dt(2013, 1, 1, tzinfo=mktz('Europe/London')) + }, + {'b': 3., + 'c': 4., + 'index': dt(2013, 1, 2, tzinfo=mktz('Europe/London')) + }, + {'b': 5., + 'c': 6., + 'index': dt(2013, 1, 3, tzinfo=mktz('Europe/London')) + }, + {'b': 7., + 'c': 8., + 'index': dt(2013, 1, 4, tzinfo=mktz('Europe/London')) + }, + {'b': 9., + 'c': 10., + 'index': dt(2013, 1, 5, tzinfo=mktz('Europe/London')) + }, + ] + + +def test_ts_write_simple(tickstore_lib): + assert 
tickstore_lib.stats()['chunks']['count'] == 0 + tickstore_lib.write('SYM', DUMMY_DATA) + assert tickstore_lib.stats()['chunks']['count'] == 1 + assert len(tickstore_lib.read('SYM')) == 5 + assert tickstore_lib.list_symbols() == ['SYM'] + + +def test_overlapping_load(tickstore_lib): + data = DUMMY_DATA + tickstore_lib.write('SYM', DUMMY_DATA) + with pytest.raises(OverlappingDataException): + tickstore_lib.write('SYM', data) + + data = DUMMY_DATA[2:] + with pytest.raises(OverlappingDataException): + tickstore_lib.write('SYM', data) + + data = DUMMY_DATA[2:3] + with pytest.raises(OverlappingDataException): + tickstore_lib.write('SYM', data) + + # overlapping at the beginning is ok + data = [DUMMY_DATA[0]] + tickstore_lib.write('SYM', data) + + # overlapping at the end is ok + data = [DUMMY_DATA[-1]] + tickstore_lib.write('SYM', data) + + +def test_ts_write_pandas(tickstore_lib): + data = DUMMY_DATA + tickstore_lib.write('SYM', data) + + data = tickstore_lib.read('SYM', columns=None).tz_localize(mktz('Europe/London')) + tickstore_lib.delete('SYM') + tickstore_lib.write('SYM', data) + + read = tickstore_lib.read('SYM', columns=None).tz_localize(mktz('Europe/London')) + assert_frame_equal(read, data, check_names=False) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/date/__init__.py b/tests/unit/date/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/date/test_daterange.py b/tests/unit/date/test_daterange.py new file mode 100644 index 000000000..54835a95f --- /dev/null +++ b/tests/unit/date/test_daterange.py @@ -0,0 +1,240 @@ +from datetime import datetime as dt +import operator +import pytest +import itertools + +from arctic.date import DateRange, string_to_daterange, CLOSED_CLOSED, CLOSED_OPEN, OPEN_CLOSED, OPEN_OPEN + + +test_ranges_for_bounding = { + "unbounded": (DateRange(), + None, None, True, None, None), + "unbounded_right": (DateRange('20110101'), + dt(2011, 1, 1), None, True, True, None), + "unbounded_left": (DateRange(None, '20111231'), + None, dt(2011, 12, 31), True, None, True), + "closed_by_default": (DateRange('20110101', '20111231'), + dt(2011, 1, 1), dt(2011, 12, 31), False, True, True), + "closed_explicitly": (DateRange('20110101', '20111231', CLOSED_CLOSED), + dt(2011, 1, 1), dt(2011, 12, 31), False, True, True), + "closed_open": (DateRange('20110101', '20111231', CLOSED_OPEN), + dt(2011, 1, 1), dt(2011, 12, 31), False, True, False), + "open_closed": (DateRange('20110101', '20111231', OPEN_CLOSED), + dt(2011, 1, 1), dt(2011, 12, 31), False, False, True), + "open_open": (DateRange('20110101', '20111231', OPEN_OPEN), + dt(2011, 1, 1), dt(2011, 12, 31), False, False, False), +} +test_ranges_for_bounding = sorted(test_ranges_for_bounding.iteritems(), key=operator.itemgetter(1)) + + +def eq_nan(*args): + if all(arg is None for arg in args): + return True + return all(arg == args[0] for arg in args[1:]) + + +@pytest.mark.parametrize(("dt_range", "start", "end", "is_unbounded", "start_in_range", "end_in_range"), + [i[1] for i in test_ranges_for_bounding], + ids=[i[0] for i in test_ranges_for_bounding]) +def test_daterange_bounding(dt_range, start, end, is_unbounded, start_in_range, end_in_range): + assert eq_nan(start, dt_range.start) + assert eq_nan(end, dt_range.end) + assert dt_range.unbounded is is_unbounded + assert dt_range.start is None or (start_in_range is (dt_range.start in dt_range)) + assert dt_range.end is None or (end_in_range is (dt_range.end in 
dt_range)) + + +test_ranges_for_parse = [ + [20110102, 20111231], + ['20110102', '20111231'], + ['2011-01-02', '2011-12-31'], + [dt(2011, 1, 2), dt(2011, 12, 31)], +] + +@pytest.mark.parametrize("date_range", test_ranges_for_parse) +def test_daterange_arg_parsing(date_range): + d1 = DateRange(date_range[0], date_range[1]) + assert d1.start == dt(2011, 1, 2) + assert d1.end == dt(2011, 12, 31) + assert d1.unbounded is False + + +def test_ambiguous_parse(): + with pytest.raises(ValueError): + DateRange('02/01/2011') + + +def test_daterange_eq(): + d1 = DateRange('20110101', '20111231') + d2 = DateRange('20110101', '20111231') + assert d1 == d2 + d1 = DateRange(None, '20111231') + d2 = DateRange(None, '20111231') + assert d1 == d2 + d1 = DateRange('20111231', None) + d2 = DateRange('20111231', None) + assert d1 == d2 + d1 = DateRange(None, None) + d2 = DateRange(None, None) + assert d1 == d2 + d1 = DateRange('20110102', '20111231') + d2 = DateRange('20110101', '20111231') + assert not d1 == d2 + + +def test_daterange_hash(): + d1 = DateRange('20110101', '20111231') + d2 = DateRange('20110101', '20111231') + assert hash(d1) == hash(d2) + d1 = DateRange(None, '20111231') + d2 = DateRange(None, '20111231') + assert hash(d1) == hash(d2) + d1 = DateRange('20111231', None) + d2 = DateRange('20111231', None) + assert hash(d1) == hash(d2) + d1 = DateRange(None, None) + d2 = DateRange(None, None) + assert hash(d1) == hash(d2) + d1 = DateRange('20110102', '20111231') + d2 = DateRange('20110101', '20111231') + assert not hash(d1) == hash(d2) + + +def test_daterange_invalid_start(): + with pytest.raises(TypeError) as ex: + DateRange(1.1, None) + assert "unsupported type for start" in str(ex.value) + + +def test_daterange_invalid_end(): + with pytest.raises(TypeError) as ex: + DateRange(None, object()) + assert "unsupported type for end" in str(ex.value) + + +def test_daterange_index(): + start, end = dt(2000, 1, 1), dt(3000, 1, 1) + dr = DateRange(start, end) + assert dr[0] == start + assert dr[1] == end + + +def test_daterange_index_error(): + start, end = dt(2000, 1, 1), dt(3000, 1, 1) + dr = DateRange(start, end) + with pytest.raises(IndexError): + dr[None] + with pytest.raises(IndexError): + dr[3] + + +def test_as_dates(): + """Various permutations of datetime/None, and date/None values.""" + dtime = dt(2010, 12, 13, 10, 30) + for testdt in [dtime, dtime.date()]: + vals = [testdt, None] + for start, end in itertools.product(vals, vals): + dr = DateRange(start, end) + dad = dr.as_dates() + if dr.start: + assert dad.start == dr.start.date() if isinstance(dr.start, dt) else dr.start + else: + assert not dad.start + if dr.end: + assert dad.end == dr.end.date() if isinstance(dr.end, dt) else dr.end + else: + assert not dad.end + + +DR1 = DateRange('20110101', '20110102') +DR2 = DateRange('201101011030', '201101021030') +DR3 = DateRange('201101011030') +DR4 = DateRange(None, '201101011030') +DR5 = DateRange('201101011030') +DR6 = DateRange('20110101', '20110102', OPEN_OPEN) +DR7 = DateRange('20110101', '20110102', OPEN_CLOSED) +DR7 = DateRange('20110101', '20110102', CLOSED_OPEN) + +STRING_DR_TESTS = [('20110101', DR1, DateRange(DR1.start.date(), DR1.end.date())), + ('20110101-20110102', DR1, DateRange(DR1.start.date(), DR1.end.date())), + ('201101011030', DR2, DateRange(DR2.start.date(), DR2.end.date())), + ('-201101011030', DR4, DateRange(None, DR2.start.date())), + ('201101011030-', DR5, DateRange(DR2.start.date())), + ('(20110101-20110102)', DR6, DateRange(DR6.start.date(), DR6.end.date(), 
DR6.interval)), + ('(20110101-20110102]', DR6, DateRange(DR6.start.date(), DR6.end.date(), DR6.interval)), + ('[20110101-20110102)', DR6, DateRange(DR6.start.date(), DR6.end.date(), DR6.interval)), + ('[20110101-20110102]', DR1, DateRange(DR1.start.date(), DR1.end.date(), DR1.interval)), + ] + + +@pytest.mark.parametrize(['instr', 'expected_ts', 'expected_dt'], STRING_DR_TESTS) +def test_string_to_daterange(instr, expected_ts, expected_dt): + assert string_to_daterange(instr) == expected_ts + assert string_to_daterange(instr, as_dates=True) == expected_dt + + +def test_string_to_daterange_raises(): + with pytest.raises(ValueError) as e: + string_to_daterange('20120101-20130101-20140101') + assert str(e.value) == "Too many dates in input string [20120101-20130101-20140101] with delimiter (-)" + +QUERY_TESTS = [(DateRange('20110101', '20110102'), {'$gte': dt(2011, 1, 1), '$lte': dt(2011, 1, 2)}), + (DateRange('20110101', '20110102', OPEN_OPEN), {'$gt': dt(2011, 1, 1), '$lt': dt(2011, 1, 2)}), + (DateRange('20110101', '20110102', OPEN_CLOSED), {'$gt': dt(2011, 1, 1), '$lte': dt(2011, 1, 2)}), + (DateRange('20110101', '20110102', CLOSED_OPEN), {'$gte': dt(2011, 1, 1), '$lt': dt(2011, 1, 2)}), + (DateRange('20110101', '20110102'), {'$gte': dt(2011, 1, 1), '$lte': dt(2011, 1, 2)}), + (DateRange('20110101', None), {'$gte': dt(2011, 1, 1)}), + (DateRange(None, '20110102'), {'$lte': dt(2011, 1, 2)}), + (DateRange(), {})] + + +@pytest.mark.parametrize(['date_range', 'expected'], QUERY_TESTS) +def test_mongo_query(date_range, expected): + assert date_range.mongo_query() == expected + + +QUERY_TESTS_DB = [(DateRange('20110101', '20110102'), ('>=', dt(2011, 1, 1), '<=', dt(2011, 1, 2))), + (DateRange('20110101', '20110102', OPEN_OPEN), ('>', dt(2011, 1, 1), '<', dt(2011, 1, 2))), + (DateRange('20110101', '20110102', OPEN_CLOSED), ('>', dt(2011, 1, 1), '<=', dt(2011, 1, 2))), + (DateRange('20110101', '20110102', CLOSED_OPEN), ('>=', dt(2011, 1, 1), '<', dt(2011, 1, 2))), + (DateRange('20110101', '20110102'), ('>=', dt(2011, 1, 1), '<=', dt(2011, 1, 2))), + (DateRange('20110101', None), ('>=', dt(2011, 1, 1), '<=' , None)), + (DateRange(None, '20110102'), ('>=', None, '<=', dt(2011, 1, 2))), + (DateRange(), ('>=', None , '<=' , None))] +@pytest.mark.parametrize(['date_range', 'expected'], QUERY_TESTS_DB) +def test_get_date_bounds(date_range, expected): + assert date_range.get_date_bounds() == expected + + +@pytest.mark.parametrize(["dr"], [(DR1,), (DR2,), (DR3,), (DR4,), (DR5,), (DR6,), (DR7,)]) +def test_intersection_with_self(dr): + assert dr == dr.intersection(dr) + + +def test_intersection_returns_inner_boundaries(): + # #start: + assert DateRange('20110103',).intersection(DateRange('20110102')).start == dt(2011, 1, 3) + assert DateRange('20110102',).intersection(DateRange('20110103')).start == dt(2011, 1, 3) + assert DateRange(None,).intersection(DateRange('20110103')).start == dt(2011, 1, 3) + assert DateRange('20110103').intersection(DateRange(None)).start == dt(2011, 1, 3) + + # #end: + assert DateRange(None, '20110103',).intersection(DateRange(None, '20110102')).end == dt(2011, 1, 2) + assert DateRange(None, '20110102',).intersection(DateRange(None, '20110103')).end == dt(2011, 1, 2) + assert DateRange(None, None,).intersection(DateRange(None, '20110103')).end == dt(2011, 1, 3) + assert DateRange(None, '20110103').intersection(DateRange(None, None)).end == dt(2011, 1, 3) + + +def test_intersection_preserves_boundaries(): + # Non-matching boundaries + assert DateRange('20110101', '20110102', 
OPEN_OPEN) == DateRange('20110101', '20110103', OPEN_CLOSED).intersection(DateRange('20110101', '20110102', OPEN_OPEN)) + assert DateRange('20110101', '20110102', OPEN_OPEN) == DateRange('20110101', '20110102', OPEN_OPEN).intersection(DateRange('20110101', '20110103', OPEN_CLOSED)) + assert DateRange('20110102', '20110103', OPEN_OPEN) == DateRange('20110102', '20110103', OPEN_OPEN).intersection(DateRange('20110101', '20110103', CLOSED_OPEN)) + + assert DateRange('20110102', '20110103', CLOSED_OPEN) == DateRange('20110102', '20110103', CLOSED_OPEN).intersection(DateRange('20110101', '20110103', CLOSED_OPEN)) + assert DateRange('20110102', '20110103', CLOSED_OPEN) == DateRange('20110101', '20110103', CLOSED_OPEN).intersection(DateRange('20110102', '20110103', CLOSED_OPEN)) + + # Matching boundaries + assert DateRange('20110101', '20110102', OPEN_OPEN) == DateRange('20110101', '20110102', CLOSED_OPEN).intersection(DateRange('20110101', '20110102', OPEN_OPEN)) + assert DateRange('20110101', '20110102', OPEN_OPEN) == DateRange('20110101', '20110102', OPEN_OPEN).intersection(DateRange('20110101', '20110102', OPEN_CLOSED)) + diff --git a/tests/unit/date/test_datetime_to_ms_roundtrip.py b/tests/unit/date/test_datetime_to_ms_roundtrip.py new file mode 100644 index 000000000..e0358b657 --- /dev/null +++ b/tests/unit/date/test_datetime_to_ms_roundtrip.py @@ -0,0 +1,80 @@ +import pytest +import datetime +from datetime import datetime as dt +import pytz +from arctic.date import mktz, datetime_to_ms, ms_to_datetime + + +def assert_roundtrip(tz): + ts = datetime.datetime(1982, 7, 1, 16, 5) + + ts1 = ts.replace(tzinfo=tz) + ts2 = ms_to_datetime(datetime_to_ms(ts1.astimezone(mktz("UTC"))), tz) + ts1 = ts1.replace(tzinfo=None) if tz == mktz() else ts1 + #logger.info(ts2.tzinfo) + + assert(ts2.hour == ts1.hour) +# assert(ts2.tzinfo == ts1.tzinfo) + assert ts2 == ts1 + + +def get_tz(): + #tz = mktz("Europe/London") + #tz = pytz.timezone("Europe/London") + #tz = pytz.timezone("UTC") + tz = pytz.timezone("Europe/London") + tmp = ms_to_datetime(0, tz) + tz = tmp.tzinfo + return tz + + +def test_UTC_roundtrip(): + tz = pytz.timezone("UTC") + assert_roundtrip(tz) + + +def test_weird_get_tz_London(): + tz = get_tz() + assert_roundtrip(tz) + + +@pytest.mark.xfail +def test_pytz_London(): + # Don't use pytz + tz = pytz.timezone("Europe/London") + assert_roundtrip(tz) + + +def test_mktz_London(): + tz = mktz("Europe/London") + assert_roundtrip(tz) + + +def test_datetime_roundtrip_lon_no_tz(): + pdt = datetime.datetime(2012, 6, 12, 12, 12, 12, 123000) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert pdt2 == pdt + + pdt = datetime.datetime(2012, 1, 12, 12, 12, 12, 123000) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert pdt2 == pdt + + +def test_datetime_roundtrip_lon_tz(): + pdt = datetime.datetime(2012, 6, 12, 12, 12, 12, 123000, tzinfo=mktz('Europe/London')) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert pdt2 == pdt.replace(tzinfo=None) + + pdt = datetime.datetime(2012, 1, 12, 12, 12, 12, 123000, tzinfo=mktz('Europe/London')) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert pdt2 == pdt.replace(tzinfo=None) + + +def test_datetime_roundtrip_est_tz(): + pdt = datetime.datetime(2012, 6, 12, 12, 12, 12, 123000, tzinfo=mktz('EST')) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert pdt2.replace(tzinfo=mktz('Europe/London')) == pdt + + pdt = datetime.datetime(2012, 1, 12, 12, 12, 12, 123000, tzinfo=mktz('EST')) + pdt2 = ms_to_datetime(datetime_to_ms(pdt)) + assert 
pdt2.replace(tzinfo=mktz('Europe/London')) == pdt diff --git a/tests/unit/date/test_mktz.py b/tests/unit/date/test_mktz.py new file mode 100644 index 000000000..fc53918c2 --- /dev/null +++ b/tests/unit/date/test_mktz.py @@ -0,0 +1,33 @@ +from datetime import datetime as dt +from mock import patch +from pytest import raises + +from arctic.date import mktz, TimezoneError + + +def test_mktz(): + tz = mktz() + d = dt(2012, 2, 2, tzinfo=tz) + assert d.tzname() == 'GMT' + d = dt(2012, 7, 2, tzinfo=tz) + assert d.tzname() == 'BST' + + tz = mktz('UTC') + d = dt(2012, 2, 2, tzinfo=tz) + assert d.tzname() == 'UTC' + d = dt(2012, 7, 2, tzinfo=tz) + assert d.tzname() == 'UTC' # --------replace_empty_timezones_with_default ----------------- + + +def test_mktz_zone(): + tz = mktz('UTC') + assert tz.zone == "UTC" + tz = mktz('/usr/share/zoneinfo/UTC') + assert tz.zone == "UTC" + + +def test_mktz_fails_if_invalid_timezone(): + with patch('os.path.exists') as file_exists: + file_exists.return_value = False + with raises(TimezoneError): + mktz('junk') diff --git a/tests/unit/date/test_util.py b/tests/unit/date/test_util.py new file mode 100644 index 000000000..236fb201b --- /dev/null +++ b/tests/unit/date/test_util.py @@ -0,0 +1,54 @@ +import pytest +import pytz + +from datetime import datetime as dt +from arctic.date import datetime_to_ms, ms_to_datetime, mktz, to_pandas_closed_closed, DateRange, OPEN_OPEN, CLOSED_CLOSED + + +@pytest.mark.parametrize('pdt', [ + dt(2007, 3, 25, 1, tzinfo=mktz('Europe/London')), + dt(2004, 10, 31, 23, 3, tzinfo=mktz('Europe/London')), + dt(1990, 4, 5, 0, 0, tzinfo=mktz('Europe/London')), + dt(2007, 3, 25, 1, tzinfo=mktz('EST')), + dt(2004, 10, 31, 23, 3, tzinfo=mktz('EST')), + dt(1990, 4, 5, 0, 0, tzinfo=mktz('EST')), + ] +) +def test_datetime_to_ms_and_back(pdt): + i = datetime_to_ms(pdt) + pdt = pdt.astimezone(mktz()) + pdt = pdt.replace(tzinfo=None) + pdt2 = ms_to_datetime(i) + assert pdt == pdt2 + + +def test_datetime_to_ms_and_back_microseconds(): + pdt = dt(2012, 8, 1, 12, 34, 56, 999999, tzinfo=mktz('Europe/London')) + i = datetime_to_ms(pdt) + pdt = pdt.replace(tzinfo=None) + pdt2 = ms_to_datetime(i) + + assert pdt != pdt2 + assert pdt.year == pdt2.year + assert pdt.month == pdt2.month + assert pdt.day == pdt2.day + assert pdt.hour == pdt2.hour + assert pdt.minute == pdt2.minute + assert pdt.second == pdt2.second + # Microsecond precision loss inevitable. 
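+    # (datetime_to_ms stores milliseconds since the epoch, so the 999999us above
+    #  is truncated to 999ms and comes back as 999000us -- hence the comparison
+    #  of pdt.microsecond // 1000 below.)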
+ assert pdt.microsecond // 1000 == pdt2.microsecond // 1000 + assert pdt.tzinfo is None + + +def test_daterange_closedclosed_None(): + assert to_pandas_closed_closed(None) is None + + +def test_daterange_closedclosed(): + date_range = DateRange(dt(2013, 1, 1, tzinfo=mktz('Europe/London')), + dt(2014, 2, 1, tzinfo=mktz('Europe/London')), OPEN_OPEN) + expected = DateRange(dt(2013, 1, 1, 0, 0, 0, 1000, tzinfo=mktz('Europe/London')), + dt(2014, 1, 31, 23, 59, 59, 999000, tzinfo=mktz('Europe/London')), + CLOSED_CLOSED) + act = to_pandas_closed_closed(date_range) + assert act == expected diff --git a/tests/unit/scripts/__init__.py b/tests/unit/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/scripts/test_arctic_fsck.py b/tests/unit/scripts/test_arctic_fsck.py new file mode 100644 index 000000000..5f45569a0 --- /dev/null +++ b/tests/unit/scripts/test_arctic_fsck.py @@ -0,0 +1,36 @@ +from mock import patch, sentinel, call + +from arctic.scripts.arctic_fsck import main + +from ...util import run_as_main + + +def test_main(): + with patch('arctic.scripts.arctic_fsck.Arctic') as Arctic, \ + patch('arctic.scripts.arctic_fsck.get_mongodb_uri') as get_mongodb_uri, \ + patch('arctic.scripts.arctic_fsck.do_db_auth') as do_db_auth: + run_as_main(main, '--host', '%s:%s' % (sentinel.host, sentinel.port), + '-v', '--library', 'sentinel.library', 'lib2', '-f') + get_mongodb_uri.assert_called_once_with('sentinel.host:sentinel.port') + Arctic.assert_called_once_with(get_mongodb_uri.return_value) + assert do_db_auth.call_args_list == [call('%s:%s' % (sentinel.host, sentinel.port), + Arctic.return_value._conn, + 'arctic_sentinel'), + call('%s:%s' % (sentinel.host, sentinel.port), + Arctic.return_value._conn, + 'arctic')] + assert Arctic.return_value.__getitem__.return_value._fsck.call_args_list == [call(False), + call(False), ] + + +def test_main_dry_run(): + with patch('arctic.scripts.arctic_fsck.Arctic') as Arctic, \ + patch('arctic.scripts.arctic_fsck.get_mongodb_uri') as get_mongodb_uri, \ + patch('arctic.scripts.arctic_fsck.do_db_auth') as do_db_auth: + run_as_main(main, '--host', '%s:%s' % (sentinel.host, sentinel.port), + '-v', '--library', 'sentinel.library', 'sentinel.lib2') + get_mongodb_uri.assert_called_once_with('sentinel.host:sentinel.port') + Arctic.assert_called_once_with(get_mongodb_uri.return_value) + assert do_db_auth.call_count == 0 + assert Arctic.return_value.__getitem__.return_value._fsck.call_args_list == [call(True), + call(True), ] diff --git a/tests/unit/scripts/test_initialize_library.py b/tests/unit/scripts/test_initialize_library.py new file mode 100644 index 000000000..76866efc7 --- /dev/null +++ b/tests/unit/scripts/test_initialize_library.py @@ -0,0 +1,87 @@ +from mock import patch +import pytest + +from arctic.scripts import arctic_init_library as mil + +from ...util import run_as_main + + +def test_init_library(): + # Create the user agains the current mongo database + with patch('pymongo.MongoClient') as MongoClient, \ + patch('arctic.scripts.arctic_init_library.logger', autospec=True) as logger, \ + patch('arctic.scripts.arctic_init_library.Arctic', autospec=True) as Arctic, \ + patch('arctic.scripts.arctic_init_library.get_mongodb_uri', autospec=True) as get_mongodb_uri, \ + patch('arctic.scripts.arctic_init_library.do_db_auth', autospec=True) as do_db_auth: + run_as_main(mil.main, '--host', 'hostname', '--library', 'arctic_user.library', '--type', 'VersionStore') + + get_mongodb_uri.assert_called_once_with('hostname') + 
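+        # The assertion order here mirrors the script's expected flow: resolve the
+        # mongo URI, connect, authenticate against the database implied by the
+        # library prefix ('arctic_user' from 'arctic_user.library'), then call
+        # initialize_library on the Arctic instance.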
MongoClient.assert_called_once_with(get_mongodb_uri.return_value) + do_db_auth.assert_called_once_with('hostname', MongoClient.return_value, 'arctic_user') + Arctic.assert_called_once_with(MongoClient.return_value) + Arctic.return_value.initialize_library.assert_called_once_with('arctic_user.library', 'VersionStore', hashed=False) + assert logger.warn.call_count == 0 + + +def test_init_library_no_admin(): + # Create the user agains the current mongo database + with patch('pymongo.MongoClient') as MongoClient, \ + patch('arctic.scripts.arctic_init_library.logger', autospec=True), \ + patch('arctic.scripts.arctic_init_library.Arctic', autospec=True) as Arctic, \ + patch('arctic.scripts.arctic_init_library.get_mongodb_uri', autospec=True) as get_mongodb_uri, \ + patch('arctic.scripts.arctic_init_library.do_db_auth', autospec=True) as do_db_auth: + run_as_main(mil.main, '--host', 'hostname', '--library', 'arctic_user.library', '--type', 'VersionStore') + + get_mongodb_uri.assert_called_once_with('hostname') + MongoClient.assert_called_once_with(get_mongodb_uri.return_value) + Arctic.assert_called_once_with(MongoClient.return_value) + Arctic.return_value.initialize_library.assert_called_once_with('arctic_user.library', 'VersionStore', hashed=False) + + +def test_init_library_hashed(): + # Create the user agains the current mongo database + with patch('pymongo.MongoClient') as MongoClient, \ + patch('arctic.scripts.arctic_init_library.logger', autospec=True) as logger, \ + patch('arctic.scripts.arctic_init_library.Arctic', autospec=True) as Arctic, \ + patch('arctic.scripts.arctic_init_library.get_mongodb_uri', autospec=True) as get_mongodb_uri, \ + patch('arctic.scripts.arctic_init_library.do_db_auth', autospec=True) as do_db_auth: + run_as_main(mil.main, '--host', 'hostname', '--library', 'arctic_user.library', '--type', 'VersionStore', '--hashed') + + get_mongodb_uri.assert_called_once_with('hostname') + MongoClient.assert_called_once_with(get_mongodb_uri.return_value) + do_db_auth.assert_called_once_with('hostname', MongoClient.return_value, 'arctic_user') + Arctic.assert_called_once_with(MongoClient.return_value) + Arctic.return_value.initialize_library.assert_called_once_with('arctic_user.library', 'VersionStore', hashed=True) + assert logger.warn.call_count == 0 + + +def test_init_library_no_admin_no_user_creds(): + with patch('pymongo.MongoClient') as MongoClient, \ + patch('arctic.scripts.arctic_init_library.logger', autospec=True) as logger, \ + patch('arctic.scripts.arctic_init_library.Arctic', autospec=True) as Arctic, \ + patch('arctic.scripts.arctic_init_library.get_mongodb_uri', autospec=True) as get_mongodb_uri, \ + patch('arctic.scripts.arctic_init_library.do_db_auth', autospec=True, return_value=False) as do_db_auth: + + MongoClient.return_value['arctic_user'].authenticate.return_value = False + run_as_main(mil.main, '--host', 'hostname', '--library', 'arctic_user.library', '--type', 'VersionStore') + + get_mongodb_uri.assert_called_once_with('hostname') + MongoClient.assert_called_once_with(get_mongodb_uri.return_value) + assert Arctic.call_count == 0 + + +def test_bad_library_name(): + with pytest.raises(Exception): + with patch('argparse.ArgumentParser.error', side_effect=Exception) as error: + run_as_main(mil.main, '--library', 'user.library') + error.assert_called_once_with('Must specify the full path of the library e.g. 
arctic_jblackburn.library!') + + with pytest.raises(Exception): + with patch('argparse.ArgumentParser.error', side_effect=Exception) as error: + run_as_main(mil.main, '--library', 'arctic_jblackburn') + error.assert_called_once_with('Must specify the full path of the library e.g. arctic_jblackburn.library!') + + with pytest.raises(Exception): + with patch('argparse.ArgumentParser.error', side_effect=Exception) as error: + run_as_main(mil.main) + error.assert_called_once_with('Must specify the full path of the library e.g. arctic_jblackburn.library!') diff --git a/tests/unit/scripts/test_utils.py b/tests/unit/scripts/test_utils.py new file mode 100644 index 000000000..d377d117d --- /dev/null +++ b/tests/unit/scripts/test_utils.py @@ -0,0 +1,105 @@ +from mock import patch, Mock, call, sentinel, MagicMock +import pytest + +from arctic.scripts import arctic_init_library as mil +from arctic.scripts.utils import do_db_auth +from ...util import run_as_main + + +def test_do_db_auth(): + # Create the user agains the current mongo database + admin_creds = Mock() + user_creds = Mock() + connection = MagicMock() + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', autospec=True, side_effect=[admin_creds, user_creds]) as get_auth: + assert do_db_auth('hostname', connection, 'arctic_user') + + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + connection.admin.authenticate.assert_called_once_with(admin_creds.user, + admin_creds.password) + # Must also ensure that we auth against the user's db too ; the user + # may well have read-only access to the admin database, but not to their user_db! + connection.__getitem__.assert_called_once_with('arctic_user') + connection.__getitem__.return_value.authenticate.assert_called_once_with(user_creds.user, user_creds.password) + assert logger.error.call_count == 0 + + +def test_do_db_auth_no_admin(): + user_creds = Mock() + connection = MagicMock() + # Create the user agains the current mongo database + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', side_effect=[None, user_creds], + autospec=True) as get_auth: + + connection.admin.authenticate.return_value = False + assert do_db_auth('hostname', connection, 'arctic_user') + + assert logger.call_count == 0 + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + connection['arctic_user'].authenticate.assert_called_once_with(user_creds.user, user_creds.password) + + +def test_do_db_auth_no_user_creds(): + user_creds = Mock() + connection = MagicMock() + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', side_effect=[None, user_creds], + autospec=True) as get_auth: + connection['arctic_user'].authenticate.return_value = False + assert not do_db_auth('hostname', connection, 'arctic_user') + + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + logger.error.assert_called_once_with("Failed to authenticate to db 'arctic_user' on 'hostname'," + " using user credentials") + + +def test_do_db_auth_no_admin_user_creds_fails(): + connection = MagicMock() + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', side_effect=[None, None], + autospec=True) as get_auth: + 
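+        # do_db_auth's expected fallback order, as exercised across these tests:
+        # try the admin credentials first, fall back to credentials for the user's
+        # own database, and report an error only when both lookups come back
+        # empty -- which is the case being set up here.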
connection.admin.authenticate.return_value = False + assert not do_db_auth('hostname', connection, 'arctic_user') + + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + logger.error.assert_called_once_with("You need credentials for db 'arctic_user' on 'hostname'," + " or admin credentials") + + +def test_do_db_auth_admin_user_creds_fails(): + connection = MagicMock() + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', side_effect=[Mock(), None], + autospec=True) as get_auth: + connection.admin.authenticate.return_value = False + assert not do_db_auth('hostname', connection, 'arctic_user') + + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + logger.error.assert_called_once_with("Failed to authenticate to '%s' as Admin. Giving up." % ('hostname')) + + +def test_do_db_auth_role(): + # Create the user agains the current mongo database + admin_creds = Mock() + user_creds = Mock() + connection = MagicMock() + with patch('arctic.scripts.utils.logger', autospec=True) as logger, \ + patch('arctic.scripts.utils.get_auth', autospec=True, side_effect=[admin_creds, user_creds]) as get_auth: + assert do_db_auth('hostname', connection, 'arctic_user') + + assert get_auth.call_args_list == [call('hostname', 'admin', 'admin'), + call('hostname', 'arctic', 'arctic_user')] + connection.admin.authenticate.assert_called_once_with(admin_creds.user, + admin_creds.password) + # Must also ensure that we auth against the user's db too ; the user + # may well have read-only access to the admin database, but not to their user_db! + connection.__getitem__.assert_called_once_with('arctic_user') + connection.__getitem__.return_value.authenticate.assert_called_once_with(user_creds.user, user_creds.password) + assert logger.error.call_count == 0 diff --git a/tests/unit/store/__init__.py b/tests/unit/store/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/store/test_ndarray_store.py b/tests/unit/store/test_ndarray_store.py new file mode 100644 index 000000000..89636b75a --- /dev/null +++ b/tests/unit/store/test_ndarray_store.py @@ -0,0 +1,66 @@ +import numpy as np +from pytest import raises +from arctic.store._ndarray_store import NdarrayStore, _promote_struct_dtypes + + +def test_dtype_parsing(): + store = NdarrayStore() + dtypes = [] + + dtypes.append(np.dtype(np.object_)) + dtypes.append(np.dtype(np.float128)) + dtypes.append(np.dtype('int64')) + dtypes.append(np.dtype([('A', 'int64')])) + dtypes.append(np.dtype([('A', 'int64'), ('B', '1 dimensional arrays, saving as Blob') + store.to_records.assert_called_once_with(sentinel.df) + + +def test_can_convert_to_records_without_objects_returns_true_otherwise(): + store = PandasStore() + store.to_records = Mock(return_value=np.rec.array([(1356998400000000000L, 'a')], + dtype=[('index', ',version=1.0,metadata={'metadata': 'foo'}" + assert str(item) == expected + assert repr(item) == expected + + +def test_versioned_item_str_handles_none(): + item = VersionedItem(symbol=None, + library=None, + data=None, + version=None, + metadata=None) + + assert str(item) diff --git a/tests/unit/store/test_version_store.py b/tests/unit/store/test_version_store.py new file mode 100644 index 000000000..1caa22da2 --- /dev/null +++ b/tests/unit/store/test_version_store.py @@ -0,0 +1,175 @@ +import bson +import datetime +from datetime import datetime as dt, timedelta as 
dtd +from mock import patch, MagicMock, sentinel, create_autospec, Mock, call, ANY +import pytest + +import pymongo +from pymongo import ReadPreference + +from arctic.date import mktz +from arctic.store import version_store +from arctic.store.version_store import VersionStore, VersionedItem +from arctic.arctic import ArcticLibraryBinding, Arctic +from arctic.exceptions import ConcurrentModificationException +from pymongo.errors import OperationFailure +from pymongo.collection import Collection + + +def test_delete_version_version_not_found(): + with patch('arctic.store.version_store.VersionStore.__init__', return_value=None, autospec=True): + with patch('arctic.store.version_store.logger') as logger: + vs = version_store.VersionStore(sentinel.connection) + vs._versions = MagicMock() + with patch.object(vs._versions, 'find_one', return_value=None, autospec=True): + vs._delete_version(sentinel.symbol, sentinel.version) + logger.error.assert_called_once_with("Can't delete sentinel.symbol:sentinel.version as not found in DB") + + +def test_list_versions_LondonTime(): + # Object ID's are stored in UTC. We need to ensure that the returned times + # for versions are in the local London TimeZone + vs = create_autospec(VersionStore, instance=True, + _versions=Mock()) + vs._find_snapshots.return_value = 'snap' + vs._versions.find.return_value = [{'_id': bson.ObjectId.from_datetime(dt(2013, 4, 1, 9, 0)), + 'symbol': 's', 'version': 10}] + + version = list(VersionStore.list_versions(vs, "symbol"))[0] + assert version == {'symbol': version['symbol'], 'version': version['version'], + # We return naive datetimes in 'default' time, which is London for us + 'date': dt(2013, 4, 1, 10, 0), + 'snapshots': 'snap'} + + +def test_read_as_of_LondonTime(): + # When we do a read, with naive as_of, that as_of is treated in London Time. + vs = create_autospec(VersionStore, instance=True, + _versions=Mock(), _allow_secondary=False) + VersionStore._read_metadata(vs, 'symbol', dt(2013, 4, 1, 9, 0)) + versions = vs._versions.with_options.return_value + versions.find_one.assert_called_once_with({'symbol':'symbol', '_id': + {'$lt': bson.ObjectId.from_datetime(dt(2013, 4, 1, 9, 0, tzinfo=mktz()) + dtd(seconds=1))}}, + sort=[('_id', pymongo.DESCENDING)]) + + +def test_read_as_of_NotNaive(): + # When we do a read, with naive as_of, that as_of is treated in London Time. + vs = create_autospec(VersionStore, instance=True, + _versions=Mock(), _allow_secondary=False) + VersionStore._read_metadata(vs, 'symbol', dt(2013, 4, 1, 9, 0, tzinfo=mktz('Europe/Paris'))) + versions = vs._versions.with_options.return_value + versions.find_one.assert_called_once_with({'symbol':'symbol', '_id': + {'$lt': bson.ObjectId.from_datetime(dt(2013, 4, 1, 9, 0, tzinfo=mktz('Europe/Paris')) + dtd(seconds=1))}}, + sort=[('_id', pymongo.DESCENDING)]) + + +def test_read_metadata_no_asof(): + # When we do a read, with naive as_of, that as_of is treated in London Time. 
+ vs = create_autospec(VersionStore, instance=True, + _versions=Mock(), _allow_secondary=False) + VersionStore._read_metadata(vs, sentinel.symbol) + versions = vs._versions.with_options.return_value + assert versions.find_one.call_args_list == [call({'symbol': sentinel.symbol}, + sort=[('version', pymongo.DESCENDING)])] + + +def test_write_ensure_index(): + write_handler = Mock(write=Mock(__name__="")) + vs = create_autospec(VersionStore, instance=True, + _collection=Mock(), + _version_nums=Mock(find_one_and_update=Mock(return_value={'version':1})), + _versions=Mock(insert_one=lambda x:None), + _arctic_lib=Mock(), + _publish_changes=False) + vs._collection.database.connection.nodes = [] + vs._write_handler.return_value = write_handler + VersionStore.write(vs, 'sym', sentinel.data, prune_previous_version=False) + vs._ensure_index.assert_called_once_with() + + +def test_write_check_quota(): + write_handler = Mock(write=Mock(__name__="")) + vs = create_autospec(VersionStore, instance=True, + _collection=Mock(), + _version_nums=Mock(find_one_and_update=Mock(return_value={'version':1})), + _versions=Mock(insert_one=lambda x:None), + _arctic_lib=create_autospec(ArcticLibraryBinding), + _publish_changes=False) + vs._collection.database.connection.nodes = [] + vs._write_handler.return_value = write_handler + VersionStore.write(vs, 'sym', sentinel.data, prune_previous_version=False) + assert vs._arctic_lib.check_quota.call_count == 1 + + +def test_initialize_library(): + arctic_lib = create_autospec(ArcticLibraryBinding) + arctic_lib.arctic = create_autospec(Arctic, _allow_secondary=False) + with patch('arctic.store.version_store.enable_powerof2sizes', autospec=True) as enable_powerof2sizes, \ + patch('arctic.store.version_store.enable_sharding', autospec=True) as enable_sharding: + arctic_lib.get_top_level_collection.return_value.database.create_collection.__name__ = 'some_name' + arctic_lib.get_top_level_collection.return_value.database.collection_names.__name__ = 'some_name' + VersionStore.initialize_library(arctic_lib, hashed=sentinel.hashed) + assert enable_powerof2sizes.call_args_list == [call(arctic_lib.arctic, arctic_lib.get_name())] + assert enable_sharding.call_args_list == [call(arctic_lib.arctic, arctic_lib.get_name(), hashed=sentinel.hashed)] + + +def test_ensure_index(): + th = Mock() + vs = create_autospec(VersionStore, _collection=Mock()) + with patch('arctic.store.version_store._TYPE_HANDLERS', [th]): + VersionStore._ensure_index(vs) + assert vs._collection.snapshots.create_index.call_args_list == [call([('name', 1)], unique=True, background=True)] + assert vs._collection.versions.create_index.call_args_list == [call([('symbol', 1), ('_id', -1)], background=True), + call([('symbol', 1), ('version', -1)], unique=True, background=True)] + assert vs._collection.version_nums.create_index.call_args_list == [call('symbol', unique=True, background=True)] + th._ensure_index.assert_called_once_with(vs._collection) + + +def test_prune_previous_versions_0_timeout(): + self = create_autospec(VersionStore, _versions=Mock()) + self.name = sentinel.name + self._versions = create_autospec(Collection) + self._versions.with_options.return_value.find.__name__ = 'find' + self._versions.with_options.return_value.find.return_value = [] + with patch('arctic.store.version_store.dt') as dt: + dt.utcnow.return_value = datetime.datetime(2013, 10, 1) + VersionStore._prune_previous_versions(self, sentinel.symbol, keep_mins=0) + assert self._versions.with_options.call_args_list == 
[call(read_preference=ReadPreference.PRIMARY)] + assert self._versions.with_options.return_value.find.call_args_list == [ + call({'$or': [{'parent': {'$exists': False}}, + {'parent': {'$size': 0}}], + 'symbol': sentinel.symbol, + '_id': {'$lt': bson.ObjectId('524a10810000000000000000')}}, + sort=[('version', -1)], + skip=1, + projection=['_id', 'type'])] + + +def test_read_handles_operation_failure(): + self = create_autospec(VersionStore, _versions=Mock(), _arctic_lib=Mock(), + _allow_secondary=True) + self._collection = create_autospec(Collection) + self._read_metadata.side_effect = [sentinel.meta1, sentinel.meta2] + self._read_metadata.__name__ = 'name' + self._do_read.__name__ = 'name' # feh: mongo_retry decorator cares about this + self._do_read.side_effect = [OperationFailure('error'), sentinel.read] + VersionStore.read(self, sentinel.symbol, sentinel.as_of, sentinel.from_version) + # Assert that, for the two read calls, the second uses the new metadata + assert self._do_read.call_args_list == [call(sentinel.symbol, sentinel.meta1, sentinel.from_version, + read_preference=ReadPreference.NEAREST)] + assert self._do_read_retry.call_args_list == [call(sentinel.symbol, sentinel.meta2, sentinel.from_version, + read_preference=ReadPreference.PRIMARY)] + + +def test_read_reports_random_errors(): + self = create_autospec(VersionStore, _versions=Mock(), _arctic_lib=Mock(), + _allow_secondary=True) + self._collection = create_autospec(Collection) + self._do_read.__name__ = 'name' # feh: mongo_retry decorator cares about this + self._do_read.side_effect = Exception('bad') + with pytest.raises(Exception) as e: + with patch('arctic.store.version_store.log_exception') as le: + VersionStore.read(self, sentinel.symbol, sentinel.as_of, sentinel.from_version) + assert 'bad' in str(e) + assert le.call_count == 1 diff --git a/tests/unit/store/test_version_store_audit.py b/tests/unit/store/test_version_store_audit.py new file mode 100644 index 000000000..34977446f --- /dev/null +++ b/tests/unit/store/test_version_store_audit.py @@ -0,0 +1,186 @@ +from mock import create_autospec, Mock, sentinel, ANY, call +from pymongo.errors import OperationFailure +import pytest +import pandas as pd + +from arctic.store.audit import ArcticTransaction +from arctic.store.version_store import VersionedItem, VersionStore +from arctic.exceptions import ConcurrentModificationException, NoDataFoundException + + +def test_ConcurrentWriteBlock_simple(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, + metadata=None, data=None) + vs.list_versions.return_value = [{'version': 2}, {'version': 1}] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, pd.DataFrame(index=[3, 4], data={'a': [1.0, 2.0]}), metadata=sentinel.meta) + + assert not vs._delete_version.called + vs.write.assert_called_once_with(sentinel.symbol, ANY, prune_previous_version=True, metadata=sentinel.meta) + vs.list_versions.assert_called_once_with(sentinel.symbol) + + +def test_ConcurrentWriteBlock_writes_if_metadata_changed(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, 
library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, metadata=None, data=None) + vs.list_versions.return_value = [{'version': 2}, + {'version': 1}] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + assert cwb._do_write is False + cwb.write(sentinel.symbol, ts1, metadata={1: 2}) + assert cwb._do_write is True + + assert not vs._delete_version.called + vs.write.assert_called_once_with(sentinel.symbol, ANY, prune_previous_version=True, metadata={1: 2}) + vs.list_versions.assert_called_once_with(sentinel.symbol) + + # Won't write on exit with same data and metadata + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, metadata={1: 2}, data=ts1) + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + assert cwb._do_write is False + cwb.write(sentinel.symbol, ts1, metadata={1: 2}) + assert cwb._do_write is False + + +def test_ConcurrentWriteBlock_writes_if_base_data_corrupted(): + + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.side_effect = OperationFailure('some failure') + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, + metadata=None, data=None) + vs.read_metadata.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, + metadata=None, data=None) + vs.list_versions.return_value = [{'version': 2}, {'version': 1}] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, ts1, metadata={1: 2}) + + vs.write.assert_called_once_with(sentinel.symbol, ANY, prune_previous_version=True, metadata={1: 2}) + assert vs.list_versions.call_args_list == [call(sentinel.symbol)] + + +def test_ConcurrentWriteBlock_writes_no_data_found(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.side_effect = NoDataFoundException('no data') + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, + metadata=None, data=None) + vs.list_versions.side_effect = [[], + [{'version': 1}], + ] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, ts1, metadata={1: 2}) + + assert vs.write.call_args_list == [call(sentinel.symbol, ANY, prune_previous_version=True, metadata={1: 2})] + assert vs.list_versions.call_args_list == [call(sentinel.symbol, latest_only=True), + call(sentinel.symbol)] + + +def test_ConcurrentWriteBlock_writes_no_data_found_deleted(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.side_effect = NoDataFoundException('no data') + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=3, + metadata=None, data=None) + vs.list_versions.side_effect = [[{'version': 2}, {'version': 1}], + [{'version': 3}, {'version': 2}], + ] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, ts1, metadata={1: 2}) + + assert vs.write.call_args_list == [call(sentinel.symbol, ANY, prune_previous_version=True, metadata={1: 2})] + assert vs.list_versions.call_args_list == [call(sentinel.symbol, latest_only=True), + call(sentinel.symbol)] + + +def 
test_ConcurrentWriteBlock_does_nothing_when_data_not_modified(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, metadata=None, data=None) + vs.list_versions.side_effect = [{'version': 2}, {'version': 1}] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]})) + + assert not vs._delete_version.called + assert not vs.write.called + + +def test_ConcurrentWriteBlock_does_nothing_when_data_is_None(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, + metadata=None, data=None) + vs.list_versions.return_value = [{'version': 1}, {'version': 2}] + + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + pass + assert not vs._delete_version.called + assert not vs.write.called + + +def test_ConcurrentWriteBlock_guards_against_inconsistent_ts(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, metadata=None, data=None) + vs.list_versions.side_effect = [{'version': 2}, {'version': 1}] + + ts1 = pd.DataFrame(index=[1, 2], data={'a': [2.0, 3.0]}) + with pytest.raises(ConcurrentModificationException): + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log, modify_timeseries=ts1) as cwb: + pass + + +def test_ConcurrentWriteBlock_detects_concurrent_writes(): + vs = create_autospec(VersionStore, _collection=Mock()) + ts1 = pd.DataFrame(index=[1, 2], data={'a':[1.0, 2.0]}) + vs.read.return_value = VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=1, metadata=None, data=ts1) + vs.write.side_effect = [VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=2, metadata=None, data=None), + VersionedItem(symbol=sentinel.symbol, library=sentinel.library, version=3, metadata=None, data=None)] + #note that we return some extra version 5, it is possible that we have a write coming in after our own write that gets picked up + vs.list_versions.side_effect = [[{'version': 5}, {'version': 2}, {'version': 1}, ], + [{'version': 5}, {'version': 3}, {'version': 2}, {'version': 1}, ]] + from threading import Event, Thread + e1 = Event() + e2 = Event() + + def losing_writer(): + #will attempt to write version 2, should find that version 2 is there and it ends up writing version 3 + with pytest.raises(ArcticTransaction): + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, pd.DataFrame([1.0, 2.0], [3, 4])) + e1.wait() + + def winning_writer(): + #will attempt to write version 2 as well + with ArcticTransaction(vs, sentinel.symbol, sentinel.user, sentinel.log) as cwb: + cwb.write(sentinel.symbol, pd.DataFrame([1.0, 2.0], 
[5, 6])) + e2.wait() + + t1 = Thread(target=losing_writer) + t2 = Thread(target=winning_writer) + t1.start() + t2.start() + + # both read the same timeseries and are locked doing some 'work' + e2.set() + # t2 should now be able to finish + t2.join() + e1.set() + t1.join() + + # we're expecting the losing_writer to undo its write once it realises that it wrote v3 instead of v2 + vs._delete_version.assert_called_once_with(sentinel.symbol, 3) diff --git a/tests/unit/store/test_version_store_utils.py b/tests/unit/store/test_version_store_utils.py new file mode 100644 index 000000000..7834e56d2 --- /dev/null +++ b/tests/unit/store/test_version_store_utils.py @@ -0,0 +1,17 @@ +import pytest +import numpy as np + +from arctic.store._version_store_utils import _split_arrs + + +def test_split_arrs_empty(): + split = _split_arrs(np.empty(0), []) + assert np.all(split == np.empty(0, dtype=np.object)) + + +def test_split_arrs(): + to_split = np.ones(10) + split = _split_arrs(to_split, [3]) + assert len(split) == 2 + assert np.all(split[0] == np.ones(3)) + assert np.all(split[1] == np.ones(7)) diff --git a/tests/unit/test_arctic.py b/tests/unit/test_arctic.py new file mode 100644 index 000000000..8e8ca8425 --- /dev/null +++ b/tests/unit/test_arctic.py @@ -0,0 +1,333 @@ +import cPickle as pickle +from mock import patch, MagicMock, sentinel, create_autospec, Mock, call +import pytest +from pymongo.errors import OperationFailure +from pymongo.mongo_client import MongoClient + +from arctic.auth import Credential +from arctic.arctic import Arctic, ArcticLibraryBinding, \ + register_library_type, LIBRARY_TYPES +from arctic.exceptions import LibraryNotFoundException, \ + ArcticException, QuotaExceededException + + +def test_arctic_lazy_init(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True) as mc, \ + patch('arctic.arctic.mongo_retry', side_effect=lambda x:x, autospec=True), \ + patch('arctic.arctic.get_auth', autospec=True) as ga: + store = Arctic('cluster') + assert not mc.called + # do something to trigger lazy arctic init + store.list_libraries() + assert mc.called + + +def test_arctic_auth(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True), \ + patch('arctic.arctic.mongo_retry', autospec=True), \ + patch('arctic.arctic.get_auth', autospec=True) as ga: + ga.return_value = Credential('db', 'admin_user', 'admin_pass') + store = Arctic('cluster') + # do something to trigger lazy arctic init + store.list_libraries() + ga.assert_called_once_with('cluster', 'arctic', 'admin') + store._adminDB.authenticate.assert_called_once_with('admin_user', 'admin_pass') + ga.reset_mock() + + # Get a 'missing' library + with pytest.raises(LibraryNotFoundException): + with patch('arctic.arctic.ArcticLibraryBinding.get_library_type', return_value=None, autospec=True): + ga.return_value = Credential('db', 'user', 'pass') + store._conn['arctic_jblackburn'].name = 'arctic_jblackburn' + store['jblackburn.library'] + + # Creating the library will have attempted to auth against it + ga.assert_called_once_with('cluster', 'arctic', 'arctic_jblackburn') + store._conn['arctic_jblackburn'].authenticate.assert_called_once_with('user', 'pass') + + +def test_arctic_auth_custom_app_name(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True), \ + patch('arctic.arctic.mongo_retry', autospec=True), \ + patch('arctic.arctic.get_auth', autospec=True) as ga: + ga.return_value = Credential('db', 'admin_user', 'admin_pass') + store = Arctic('cluster', 
app_name=sentinel.app_name) + # do something to trigger lazy arctic init + store.list_libraries() + assert ga.call_args_list == [call('cluster', sentinel.app_name, 'admin')] + ga.reset_mock() + + # Get a 'missing' library + with pytest.raises(LibraryNotFoundException): + with patch('arctic.arctic.ArcticLibraryBinding.get_library_type', return_value=None, autospec=True): + ga.return_value = Credential('db', 'user', 'pass') + store._conn['arctic_jblackburn'].name = 'arctic_jblackburn' + store['jblackburn.library'] + + # Creating the library will have attempted to auth against it + assert ga.call_args_list == [call('cluster', sentinel.app_name, 'arctic_jblackburn')] + + +def test_arctic_connect_hostname(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True) as mc, \ + patch('arctic.arctic.mongo_retry', autospec=True) as ar, \ + patch('arctic.arctic.get_mongodb_uri', autospec=True) as gmu: + store = Arctic('hostname', socketTimeoutMS=sentinel.socket_timeout, + connectTimeoutMS=sentinel.connect_timeout, + serverSelectionTimeoutMS=sentinel.select_timeout) + # do something to trigger lazy arctic init + store.list_libraries() + ar(mc).assert_called_once_with(host=gmu('hostname'), maxPoolSize=4, + socketTimeoutMS=sentinel.socket_timeout, + connectTimeoutMS=sentinel.connect_timeout, + serverSelectionTimeoutMS=sentinel.select_timeout) + + +def test_arctic_connect_with_environment_name(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True) as mc, \ + patch('arctic.arctic.mongo_retry', autospec=True) as ar, \ + patch('arctic.arctic.get_auth', autospec=True), \ + patch('arctic.arctic.get_mongodb_uri') as gmfe: + store = Arctic('live', socketTimeoutMS=sentinel.socket_timeout, + connectTimeoutMS=sentinel.connect_timeout, + serverSelectionTimeoutMS=sentinel.select_timeout) + # do something to trigger lazy arctic init + store.list_libraries() + assert gmfe.call_args_list == [call('live')] + assert ar(mc).call_args_list == [call(host=gmfe.return_value, maxPoolSize=4, + socketTimeoutMS=sentinel.socket_timeout, + connectTimeoutMS=sentinel.connect_timeout, + serverSelectionTimeoutMS=sentinel.select_timeout)] + + +@pytest.mark.parametrize( + ["library", "expected_library", "expected_database"], [ + ('library', 'library', 'arctic'), + ('user.library', 'library', 'arctic_user'), + ]) +def test_database_library_specifier(library, expected_library, expected_database): + mongo = MagicMock() + with patch('arctic.arctic.ArcticLibraryBinding._auth'): + ml = ArcticLibraryBinding(mongo, library) + + assert ml.library == expected_library + mongo._conn.__getitem__.assert_called_with(expected_database) + + +def test_arctic_repr(): + with patch('pymongo.MongoClient', return_value=MagicMock(), autospec=True): + with patch('arctic.arctic.mongo_retry', autospec=True): + with patch('arctic.arctic.get_auth', autospec=True) as ga: + ga.return_value = Credential('db', 'admin_user', 'admin_pass') + store = Arctic('cluster') + assert str(store) == repr(store) + + +def test_lib_repr(): + mongo = MagicMock() + with patch('arctic.arctic.ArcticLibraryBinding._auth'): + ml = ArcticLibraryBinding(mongo, 'asdf') + assert str(ml) == repr(ml) + + +def test_register_library_type(): + class DummyType(object): + pass + register_library_type("new_dummy_type", DummyType) + assert LIBRARY_TYPES['new_dummy_type'] == DummyType + + with pytest.raises(ArcticException) as e: + register_library_type("new_dummy_type", DummyType) + assert "ArcticException: Library new_dummy_type already registered as " in 
str(e) + + +def test_set_quota(): + self = create_autospec(ArcticLibraryBinding) + ArcticLibraryBinding.set_quota(self, 10000) + self.set_library_metadata.assert_called_once_with('QUOTA', 10000) + assert self.quota_countdown == 0 + assert self.quota == 10000 + + +def test_get_quota(): + self = create_autospec(ArcticLibraryBinding) + self.get_library_metadata.return_value = 42 + assert ArcticLibraryBinding.get_quota(self) == 42 + self.get_library_metadata.assert_called_once_with('QUOTA') + + +def test_check_quota_Zero(): + self = create_autospec(ArcticLibraryBinding) + self.quota = 0 + ArcticLibraryBinding.check_quota(self) + + +def test_check_quota_None(): + self = create_autospec(ArcticLibraryBinding) + self.quota = None + self.get_library_metadata.return_value = None + ArcticLibraryBinding.check_quota(self) + self.get_library_metadata.assert_called_once_with('QUOTA') + assert self.quota == 0 + + +def test_check_quota_Zero2(): + self = create_autospec(ArcticLibraryBinding) + self.quota = None + self.get_library_metadata.return_value = 0 + ArcticLibraryBinding.check_quota(self) + self.get_library_metadata.assert_called_once_with('QUOTA') + assert self.quota == 0 + + +def test_check_quota_countdown(): + self = create_autospec(ArcticLibraryBinding) + self.quota = 10 + self.quota_countdown = 10 + ArcticLibraryBinding.check_quota(self) + assert self.quota_countdown == 9 + + +def test_check_quota(): + self = create_autospec(ArcticLibraryBinding) + self.arctic = create_autospec(Arctic) + self.quota = 1024 * 1024 * 1024 + self.quota_countdown = 0 + self.arctic.__getitem__.return_value = Mock(stats=Mock(return_value={'totals': + {'size': 900 * 1024 * 1024, + 'count': 100, + } + })) + with patch('arctic.logging.logger.warn') as warn: + ArcticLibraryBinding.check_quota(self) + self.arctic.__getitem__.assert_called_once_with(self.get_name.return_value) + warn.assert_called_once_with('Mongo Quota: 0.879 / 1 GB used') + assert self.quota_countdown == 6 + + +def test_check_quota_info(): + self = create_autospec(ArcticLibraryBinding) + self.arctic = create_autospec(Arctic) + self.quota = 1024 * 1024 * 1024 + self.quota_countdown = 0 + self.arctic.__getitem__.return_value = Mock(stats=Mock(return_value={'totals': + {'size': 1 * 1024 * 1024, + 'count': 100, + } + })) + with patch('arctic.logging.logger.info') as info: + ArcticLibraryBinding.check_quota(self) + self.arctic.__getitem__.assert_called_once_with(self.get_name.return_value) + info.assert_called_once_with('Mongo Quota: 0.001 / 1 GB used') + assert self.quota_countdown == 51153 + + +def test_check_quota_exceeded(): + self = create_autospec(ArcticLibraryBinding) + self.arctic = create_autospec(Arctic) + self.quota = 1024 * 1024 * 1024 + self.quota_countdown = 0 + self.arctic.__getitem__.return_value = Mock(stats=Mock(return_value={'totals': + {'size': 1024 * 1024 * 1024, + 'count': 100, + } + })) + with pytest.raises(QuotaExceededException) as e: + ArcticLibraryBinding.check_quota(self) + assert "Quota Exceeded: 1.000 / 1 GB used" in str(e) + + +def test_initialize_library(): + self = create_autospec(Arctic) + self._conn = create_autospec(MongoClient) + lib = create_autospec(ArcticLibraryBinding) + lib.database_name = sentinel.db_name + lib.get_quota.return_value = None + lib_type = Mock() + with patch.dict('arctic.arctic.LIBRARY_TYPES', {sentinel.lib_type: lib_type}), \ + patch('arctic.arctic.ArcticLibraryBinding', return_value=lib, autospec=True) as ML: + Arctic.initialize_library(self, sentinel.lib_name, sentinel.lib_type, thing=sentinel.thing) 
+ assert ML.call_args_list == [call(self, sentinel.lib_name)] + assert ML.return_value.set_library_type.call_args_list == [call(sentinel.lib_type)] + assert ML.return_value.set_quota.call_args_list == [call(10 * 1024 * 1024 * 1024)] + assert lib_type.initialize_library.call_args_list == [call(ML.return_value, thing=sentinel.thing)] + + +def test_initialize_library_too_many_ns(): + self = create_autospec(Arctic) + self._conn = create_autospec(MongoClient) + lib = create_autospec(ArcticLibraryBinding) + lib.database_name = sentinel.db_name + self._conn.__getitem__.return_value.collection_names.return_value = [x for x in xrange(3001)] + lib_type = Mock() + with pytest.raises(ArcticException) as e: + with patch.dict('arctic.arctic.LIBRARY_TYPES', {sentinel.lib_type: lib_type}), \ + patch('arctic.arctic.ArcticLibraryBinding', return_value=lib, autospec=True) as ML: + Arctic.initialize_library(self, sentinel.lib_name, sentinel.lib_type, thing=sentinel.thing) + assert self._conn.__getitem__.call_args_list == [call(sentinel.db_name), + call(sentinel.db_name)] + assert lib_type.initialize_library.call_count == 0 + assert 'Too many namespaces 3001, not creating: sentinel.lib_name' in str(e) + + +def test_get_library(): + self = create_autospec(Arctic) + self._library_cache = {} + library_type = Mock() + register_library_type(sentinel.lib_type, library_type) + with patch('arctic.arctic.ArcticLibraryBinding', autospec=True) as ML: + ML.return_value.get_library_type.return_value = sentinel.lib_type + library = Arctic.get_library(self, sentinel.lib_name) + del LIBRARY_TYPES[sentinel.lib_type] + assert ML.call_args_list == [call(self, sentinel.lib_name)] + assert library_type.call_args_list == [call(ML.return_value)] + assert library == library_type.return_value + + +def test_get_library_not_initialized(): + self = create_autospec(Arctic, + mongo_host=sentinel.host) + self._library_cache = {} + with pytest.raises(LibraryNotFoundException) as e, \ + patch('arctic.arctic.ArcticLibraryBinding', autospec=True) as ML: + ML.return_value.get_library_type.return_value = None + Arctic.get_library(self, sentinel.lib_name) + assert "Library %s was not correctly initialized in %s." % (sentinel.lib_name, self) in str(e) + + +def test_get_library_auth_issue(): + self = create_autospec(Arctic, + mongo_host=sentinel.host) + self._library_cache = {} + with pytest.raises(LibraryNotFoundException) as e, \ + patch('arctic.arctic.ArcticLibraryBinding', autospec=True) as ML: + ML.return_value.get_library_type.side_effect = OperationFailure('database error: not authorized for query on arctic_marketdata.index.ARCTIC') + Arctic.get_library(self, sentinel.lib_name) + assert "Library %s was not correctly initialized in %s." 
% (sentinel.lib_name, self) in str(e) + + +def test_get_library_not_registered(): + self = create_autospec(Arctic) + self._library_cache = {} + with pytest.raises(LibraryNotFoundException) as e, \ + patch('arctic.arctic.ArcticLibraryBinding', autospec=True) as ML: + ML.return_value.get_library_type.return_value = sentinel.lib_type + Arctic.get_library(self, sentinel.lib_name) + assert ("Couldn't load LibraryType '%s' for '%s' (has the class been registered?)" % + (sentinel.lib_type, sentinel.lib_name) + )in str(e) + + +def test_mongo_host_get_set(): + sentinel.mongo_host = Mock(nodes={("host", "port")}) + arctic = Arctic(sentinel.mongo_host) + assert arctic.mongo_host == "host:port" + + +def test_arctic_set_get_state(): + sentinel.mongo_host = Mock(nodes={("host", "port")}) + store = Arctic(sentinel.mongo_host, allow_secondary="allow_secondary") + buff = pickle.dumps(store) + mnew = pickle.loads(buff) + assert mnew.mongo_host == "host:port" + assert mnew._allow_secondary == "allow_secondary" diff --git a/tests/unit/test_auth.py b/tests/unit/test_auth.py new file mode 100644 index 000000000..fe23d2659 --- /dev/null +++ b/tests/unit/test_auth.py @@ -0,0 +1,17 @@ +from mock import create_autospec, sentinel +from pymongo.database import Database +from pymongo.errors import PyMongoError + +from arctic import auth + + +def test_authenticate(): + db = create_autospec(Database) + db.authenticate.return_value = sentinel.ret + assert auth.authenticate(db, sentinel.user, sentinel.password) == sentinel.ret + + +def test_authenticate_fails(): + db = create_autospec(Database) + db.authenticate.side_effect = PyMongoError("error") + assert auth.authenticate(db, sentinel.user, sentinel.password) is False diff --git a/tests/unit/test_compress.py b/tests/unit/test_compress.py new file mode 100644 index 000000000..69aebbf51 --- /dev/null +++ b/tests/unit/test_compress.py @@ -0,0 +1,72 @@ +import lz4 +import pytest +import random +import string + +import arctic._compress as c + + +def test_roundtrip(): + _str = "hello world" + cstr = c.compress(_str) + assert _str == c.decompress(cstr) + + +@pytest.mark.parametrize("n", [1, 1e2, 1e3, 1e6]) +def test_roundtrip_multi(n): + _str = random_string(n) + cstr = c.compress(_str) + assert _str == c.decompress(cstr) + + +def test_roundtripHC(): + _str = "hello world" + cstr = c.compressHC(_str) + assert _str == c.decompress(cstr) + + +def test_roundtripLZ4(): + _str = "hello world" + cstr = lz4.compress(_str) + assert _str == c.decompress(cstr) + + +def test_roundtripLZ4Back(): + _str = "hello world" + cstr = c.compress(_str) + assert _str == lz4.decompress(cstr) + + +def test_roundtripLZ4HC(): + _str = "hello world" + cstr = lz4.compressHC(_str) + assert _str == c.decompress(cstr) + + +def test_roundtripLZ4HCBack(): + _str = "hello world" + cstr = c.compressHC(_str) + assert _str == lz4.decompress(cstr) + + +@pytest.mark.parametrize("n, length", [(1, 10), (100, 10), (1000, 10)]) +def test_roundtrip_arr(n, length): + _strarr = [random_string(length) for _ in range(n)] + cstr = c.compressarr(_strarr) + assert _strarr == c.decompressarr(cstr) + + +@pytest.mark.parametrize("n, length", [(1, 10), (100, 10), (1000, 10)]) +def test_roundtrip_arrHC(n, length): + _strarr = [random_string(length) for _ in range(n)] + cstr = c.compressarrHC(_strarr) + assert _strarr == c.decompressarr(cstr) + + +def test_arr_zero(): + assert [] == c.compressarrHC([]) + assert [] == c.decompressarr([]) + + +def random_string(N): + return ''.join(random.choice(string.printable) for _ in 
range(int(N))) diff --git a/tests/unit/test_compression.py b/tests/unit/test_compression.py new file mode 100644 index 000000000..8aef0bb25 --- /dev/null +++ b/tests/unit/test_compression.py @@ -0,0 +1,101 @@ +from mock import patch, Mock + +from arctic._compression import use_lz4hc, _should_use_lz4hc, _is_interactive_mode, compress, compress_array, decompress, decompress_array +from arctic import _compression + + +def teardown_function(function): + _compression.USE_LZ4HC = True + + +def test_use_lz4hc(): + use_lz4hc(True) + assert _compression.USE_LZ4HC is True + use_lz4hc(False) + assert _compression.USE_LZ4HC is False + + +def test_use_lz4hc_True(): + use_lz4hc(True) + assert _should_use_lz4hc() is True + + +def test_use_lz4hc_False(): + use_lz4hc(False) + assert _should_use_lz4hc() is False + + +def test__is_interactive_mode(): + assert _is_interactive_mode() is False # in a test! + + +def test_compress(): + assert len(compress("foobar")) > 0 + + +def test_compress_LZ4HC(): + use_lz4hc(True) + cfn = Mock() + with patch('arctic._compression.clz4.compressHC', cfn): + compress("foo") + assert cfn.call_count == 1 + + +def test_compress_LZ4(): + use_lz4hc(False) + cfn = Mock() + with patch('arctic._compression.clz4.compress', cfn): + compress("foo") + assert cfn.call_count == 1 + + +def test_compressarr(): + assert len(compress_array(["foobar"*10])) > 0 + assert isinstance(compress_array(["foobar"*10]), list) + + +def test_compressarr_LZ4HC(): + assert len(compress_array(["foobar"*10])) > 0 + assert isinstance(compress_array(["foobar"*10]), list) + + +def test_compress_array_usesLZ4HC(): + use_lz4hc(True) + cfn = Mock() + with patch('arctic._compression.clz4.compressarrHC', cfn): + compress_array(["foo"] * 100) + assert cfn.call_count == 1 + + +def test_compress_array_usesLZ4(): + use_lz4hc(False) + cfn = Mock() + with patch('arctic._compression.clz4.compressarr', cfn): + compress_array(["foo"] * 100) + assert cfn.call_count == 1 + + +def test_compress_array_LZ4HC_sequential(): + use_lz4hc(True) + cfn = Mock() + with patch('arctic._compression.clz4.compressHC', cfn): + compress_array(["foo"] * 4) + assert cfn.call_count == 4 + + +def test_compress_array_LZ4_sequential(): + use_lz4hc(False) + cfn = Mock() + with patch('arctic._compression.clz4.compress', cfn): + compress_array(["foo"] * 49) + assert cfn.call_count == 49 + + +def test_decompress(): + assert decompress(compress("foo")) == "foo" + + +def test_decompress_array(): + ll = ['foo%s' % i for i in range(100)] + assert decompress_array(compress_array(ll)) == ll + diff --git a/tests/unit/test_decorators_unit.py b/tests/unit/test_decorators_unit.py new file mode 100644 index 000000000..7085e8cfd --- /dev/null +++ b/tests/unit/test_decorators_unit.py @@ -0,0 +1,162 @@ +from mock import patch, create_autospec, sentinel, Mock, PropertyMock, MagicMock +import pytest +from pymongo.errors import AutoReconnect, OperationFailure, DuplicateKeyError, ServerSelectionTimeoutError +from pymongo.read_preferences import ReadPreference + +from arctic import decorators +from arctic.decorators import mongo_retry, _get_host +from pymongo.collection import Collection + + +def test_mongo_retry(): + retries = [2] + self = MagicMock() + self._arctic_lib.arctic.mongo_host = sentinel.host + self._collection.database.client.nodes = set([('a', 12)]) + self._arctic_lib.get_name.return_value = sentinel.lib_name + with patch('arctic.decorators._handle_error', autospec=True) as he: + @mongo_retry + def foo(self): + if retries[0] == 2: + retries[0] -= 1 + raise 
OperationFailure('error') + elif retries[0] == 1: + retries[0] -= 1 + raise AutoReconnect('error') + return "success" + foo(self) + assert he.call_count == 2 + assert isinstance(he.call_args_list[0][0][1], OperationFailure) + assert he.call_args_list[0][0][2] == 1 + assert he.call_args_list[0][1] == {'mnodes': ['a:12'], + 'mhost': 'sentinel.host', + 'l': sentinel.lib_name} + assert isinstance(he.call_args_list[1][0][1], AutoReconnect) + assert he.call_args_list[1][0][2] == 2 + + +def test_mongo_retry_fails(): + error = OperationFailure('error') + retries = [16] + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + if retries[0]: + retries[0] -= 1 + raise error + return "success" + with pytest.raises(OperationFailure): + foo() + assert le.call_count == 15 + assert le.call_args[0][0] == 'foo' + assert le.call_args[0][1] == error + + +def test_retry_nested(): + error = OperationFailure('error') + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + @mongo_retry + def bar(): + raise error + try: + bar() + except: + raise error + with pytest.raises(OperationFailure): + foo() + assert le.call_count == 15 + assert le.call_args[0][0] == 'bar' + assert le.call_args[0][1] == error + + +def test_all_other_exceptions_logged(): + with patch('arctic.decorators._log_exception', autospec=True) as le: + def foo(): + raise Exception("Unexpected Error") + foo.__module__ = 'arctic.foo' + foo = mongo_retry(foo) + with pytest.raises(Exception) as e: + foo() + assert "Unexpected Error" in str(e) + assert le.call_count == 1 + assert le.call_args[0][0] == "foo" + + +def test_other_exceptions_not_logged_outside_of_arctic(): + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + raise Exception("Unexpected Error") + with pytest.raises(Exception) as e: + foo() + assert "Unexpected Error" in str(e) + assert le.call_count == 0 + + +@pytest.mark.xfail(reason="CS-8393 Mongo server reports auth failure when servers flip") +def test_auth_failure_no_retry(): + error = OperationFailure('unauthorized for db:arctic_jblackburn') + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + raise error + with pytest.raises(OperationFailure) as e: + foo() + assert 'OperationFailure: unauthorized for db:arctic_jblackburn' in str(e) + assert le.call_count == 1 + + +def test_duplicate_key_failure_no_retry(): + error = DuplicateKeyError('duplicate key') + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + raise error + with pytest.raises(OperationFailure) as e: + foo() + assert 'duplicate key' in str(e) + assert le.call_count == 1 + + +def test_ServerSelectionTimeoutError_no_retry(): + error = ServerSelectionTimeoutError('some error') + with patch('arctic.decorators._log_exception', autospec=True) as le: + @mongo_retry + def foo(): + raise error + with pytest.raises(ServerSelectionTimeoutError) as e: + foo() + assert 'some error' in str(e) + assert le.call_count == 1 + + +def test_get_host(): + store = Mock() + store._arctic_lib.arctic.mongo_host = sentinel.host + store._collection.database.client.nodes = set([('a', 12)]) + store._arctic_lib.get_name.return_value = sentinel.lib_name + assert _get_host(store) == {'mhost': 'sentinel.host', + 'mnodes': ['a:12'], + 'l': sentinel.lib_name, + } + + +def test_get_host_list(): + store = Mock() + store._arctic_lib.arctic.mongo_host = sentinel.host + 
store._collection.database.client.nodes = set([('a', 12)]) + store._arctic_lib.get_name.return_value = sentinel.lib_name + assert _get_host([store]) == {'mhost': 'sentinel.host', + 'mnodes': ['a:12'], + 'l': sentinel.lib_name, + } + + +def test_get_host_not_a_vs(): + store = MagicMock() + store._arctic_lib.get_name.side_effect = AttributeError("Hello") + assert _get_host(store) == {} + store._arctic_lib.get_name.side_effect = ValueError("Hello") + assert _get_host(store) == {} diff --git a/tests/unit/test_hosts.py b/tests/unit/test_hosts.py new file mode 100644 index 000000000..a3435d889 --- /dev/null +++ b/tests/unit/test_hosts.py @@ -0,0 +1,36 @@ +from mock import patch, sentinel, call, PropertyMock, Mock +import os +import pytest + +from ConfigParser import NoSectionError +from arctic.hosts import get_arctic_lib + + +def test_get_arctic_lib_with_known_host(): + with patch('arctic.arctic.Arctic') as Arctic: + get_arctic_lib("foo@bar") + assert Arctic.call_args_list == [call('bar')] + + +def test_get_arctic_lib_with_unknown_host(): + with patch('arctic.arctic.Arctic') as Arctic: + with patch('pymongo.MongoClient') as MongoClient: + get_arctic_lib("foo@bar:123") + assert Arctic.call_args_list == [call("bar:123")] + + +def test_get_arctic_connection_strings(): + with patch('arctic.arctic.Arctic') as Arctic: + with patch('pymongo.MongoClient') as MongoClient: + get_arctic_lib("foo@bar") + get_arctic_lib("foo.sheep@bar") + get_arctic_lib("foo.sheep@bar:123") + get_arctic_lib("foo.sheep@127.0.0.1:123") + + +@pytest.mark.parametrize( + ["string"], [('donkey',), ('donkey:ride@blackpool',), + ('donkey:ride',)]) +def test_get_arctic_malformed_connection_strings(string): + with pytest.raises(ValueError): + get_arctic_lib(string) diff --git a/tests/unit/tickstore/__init__.py b/tests/unit/tickstore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/tickstore/test_toplevel.py b/tests/unit/tickstore/test_toplevel.py new file mode 100644 index 000000000..946f4ecab --- /dev/null +++ b/tests/unit/tickstore/test_toplevel.py @@ -0,0 +1,147 @@ +from mock import Mock, patch, MagicMock, create_autospec, sentinel +import pytest +from datetime import datetime as dt +import pandas as pd +from pandas.util.testing import assert_frame_equal +import numpy as np +from mockextras import when + +from arctic.date import DateRange, mktz +from arctic.exceptions import OverlappingDataException +from arctic.tickstore.toplevel import TopLevelTickStore, TickStoreLibrary +from dateutil.rrule import rrule, DAILY + + +def test_raise_exception_if_daterange_is_not_provided(): + store = TopLevelTickStore(Mock()) + with pytest.raises(Exception) as e: + store._get_library_metadata(None) + assert "A date range must be provided" in str(e) + + +def test_raise_exception_if_date_range_does_not_contain_start_date(): + store = TopLevelTickStore(Mock()) + dr = DateRange(start=None, end=dt(2011, 1, 1)) + with pytest.raises(Exception) as e: + store._get_library_metadata(dr) + assert "The date range {0} must contain a start and end date".format(dr) in str(e) + + +def test_raise_exception_if_date_range_does_not_contain_end_date(): + store = TopLevelTickStore(Mock()) + dr = DateRange(start=dt(2011, 1, 1), end=None) + with pytest.raises(Exception) as e: + store._get_library_metadata(dr) + assert "The date range {0} must contain a start and end date".format(dr) in str(e) + + +def test_raise_exception_if_date_range_does_not_contain_start_and_end_date(): + store = TopLevelTickStore(Mock()) + dr = DateRange(start=None, 
end=None) + with pytest.raises(Exception) as e: + store._get_library_metadata(dr) + assert "The date range {0} must contain a start and end date".format(dr) in str(e) + + +def test_raise_exception_and_log_an_error_if_an_invalid_library_name_is_added(): + arctic_lib = MagicMock() + arctic_lib.arctic.__getitem__.side_effect = Exception() + store = TopLevelTickStore(arctic_lib) + with patch("arctic.tickstore.toplevel.logger") as mock_logger: + with pytest.raises(Exception): + store.add(None, "blah") + mock_logger.error.assert_called_once_with("Could not load library") + + +def test_raise_exception_if_date_range_overlaps(): + self = create_autospec(TopLevelTickStore, _arctic_lib=MagicMock()) + self._get_library_metadata.return_value = [TickStoreLibrary('lib1', None), ] + with pytest.raises(OverlappingDataException) as e: + TopLevelTickStore.add(self, DateRange(start=dt(2010, 1, 1), end=dt(2011, 1, 1, 23, 59, 59, 999000)), "blah") + assert "There are libraries that overlap with the date range:" in str(e) + + +@pytest.mark.parametrize(('start', 'end', 'expected_start', 'expected_end'), + [(dt(2010, 1, 1, tzinfo=mktz('UTC')), dt(2010, 12, 31, 23, 59, 59, 999000, tzinfo=mktz('UTC')), + dt(2010, 1, 1, tzinfo=mktz('UTC')), dt(2010, 12, 31, 23, 59, 59, 999000, tzinfo=mktz('UTC'))), + (dt(2010, 1, 1), dt(2010, 12, 31, 23, 59, 59, 999000), dt(2010, 1, 1, tzinfo=mktz('UTC')), + dt(2010, 12, 31, 23, 59, 59, 999000, tzinfo=mktz('UTC'))), + (dt(2009, 12, 31, 19, tzinfo=mktz('America/New_York')), dt(2010, 12, 31, 18, 59, 59, 999000, tzinfo=mktz('America/New_York')), + dt(2010, 1, 1, tzinfo=mktz('UTC')), dt(2010, 12, 31, 23, 59, 59, 999000, tzinfo=mktz('UTC'))) + ]) +def test_add_library_to_colllection_if_date_range_is_on_UTC_or_naive_day_boundaries(start, end, expected_start, expected_end): + self = create_autospec(TopLevelTickStore, _arctic_lib=MagicMock(), _collection=MagicMock()) + self._get_library_metadata.return_value = [] + TopLevelTickStore.add(self, DateRange(start=start, end=end), "blah") + self._collection.update_one.assert_called_once_with({'library_name': "blah"}, + {'$set': + {'start': expected_start, + 'end': expected_end}}, upsert=True) + + +@pytest.mark.parametrize(('start', 'end'), + [(dt(2010, 1, 1, 2, tzinfo=mktz('UTC')), dt(2011, 1, 1, tzinfo=mktz('UTC'))), + (dt(2010, 1, 1, tzinfo=mktz('UTC')), dt(2011, 1, 1, 2, tzinfo=mktz('UTC'))), + (dt(2010, 1, 1, 2, tzinfo=mktz('UTC')), dt(2011, 1, 1, 2, tzinfo=mktz('UTC'))), + (dt(2010, 1, 1, 2), dt(2011, 1, 1)), + (dt(2010, 1, 1), dt(2011, 1, 1, 2)), + (dt(2010, 1, 1, 2), dt(2011, 1, 1, 2)), + (dt(2009, 12, 31, 21, 10, tzinfo=mktz('America/New_York')), dt(2010, 12, 31, tzinfo=mktz('America/New_York'))), + (dt(2009, 12, 31, tzinfo=mktz('America/New_York')), dt(2010, 12, 31, tzinfo=mktz('America/New_York'))), + (dt(2009, 12, 31, 21, 10, tzinfo=mktz('America/New_York')), dt(2010, 12, 31, 9, 21, tzinfo=mktz('America/New_York'))) + ]) +def test_raise_error_add_library_is_called_with_a_date_range_not_on_day_boundaries(start, end): + with pytest.raises(AssertionError) as e: + self = create_autospec(TopLevelTickStore, _arctic_lib=MagicMock(), _collection=MagicMock()) + self._get_library_metadata.return_value = [] + TopLevelTickStore.add(self, DateRange(start=start, end=end), "blah") + assert "Date range should fall on UTC day boundaries" in str(e) + + +@pytest.mark.parametrize(('start', 'end', 'expected_start_index', 'expected_end_index'), + [(dt(2010, 1, 1), dt(2010, 1, 5), 0, 3), + (dt(2010, 1, 1), dt(2010, 1, 6), 0, 3), + (dt(2010, 1, 1, 1), dt(2010, 
1, 6), 1, 3), + (dt(2010, 1, 1, 1), dt(2010, 1, 4, 2), 1, 2), + (dt(2009, 1, 1), dt(2010, 1, 5), 0, 3), + ]) +def test_slice_pandas_dataframe(start, end, expected_start_index, expected_end_index): + top_level_tick_store = TopLevelTickStore(Mock()) + dates = pd.date_range('20100101', periods=5, freq='2D') + data = pd.DataFrame(np.random.randn(5, 4), index=dates, columns=list('ABCD')) + expected = data.ix[expected_start_index:expected_end_index] + result = top_level_tick_store._slice(data, start, end) + assert_frame_equal(expected, result), '{}\n{}'.format(expected, result) + + +@pytest.mark.parametrize(('start', 'end', 'expected_start_index', 'expected_end_index'), + [(dt(2010, 1, 1), dt(2010, 1, 5), 0, 3), + (dt(2010, 1, 1), dt(2010, 1, 6), 0, 3), + (dt(2010, 1, 1, 1), dt(2010, 1, 6), 1, 3), + (dt(2010, 1, 1, 1), dt(2010, 1, 4, 2), 1, 2), + (dt(2009, 1, 1), dt(2010, 1, 5), 0, 3), + ]) +def test_slice_list_of_dicts(start, end, expected_start_index, expected_end_index): + top_level_tick_store = TopLevelTickStore(Mock()) + dates = list(rrule(DAILY, count=5, dtstart=dt(2010, 1, 1), interval=2)) + data = [{'index': date, 'A': val} for date, val in zip(dates, range(5))] + expected = data[expected_start_index:expected_end_index] + result = top_level_tick_store._slice(data, start, end) + assert expected == result + + +def test_write_pandas_data_to_right_libraries(): + self = create_autospec(TopLevelTickStore, _arctic_lib=MagicMock(), _collection=MagicMock()) + self._collection.find.return_value = [{'library_name': sentinel.libname1, 'start': sentinel.st1, 'end': sentinel.end1}, + {'library_name': sentinel.libname2, 'start': sentinel.st2, 'end': sentinel.end2}] + slice1 = range(2) + slice2 = range(4) + when(self._slice).called_with(sentinel.data, sentinel.st1, sentinel.end1).then(slice1) + when(self._slice).called_with(sentinel.data, sentinel.st2, sentinel.end2).then(slice2) + mock_lib1 = Mock() + mock_lib2 = Mock() + when(self._arctic_lib.arctic.__getitem__).called_with(sentinel.libname1).then(mock_lib1) + when(self._arctic_lib.arctic.__getitem__).called_with(sentinel.libname2).then(mock_lib2) + TopLevelTickStore.write(self, 'blah', sentinel.data) + mock_lib1.write.assert_called_once_with('blah', slice1) + mock_lib2.write.assert_called_once_with('blah', slice2) diff --git a/tests/util.py b/tests/util.py new file mode 100644 index 000000000..c1b25b751 --- /dev/null +++ b/tests/util.py @@ -0,0 +1,51 @@ +from contextlib import contextmanager +from cStringIO import StringIO +from dateutil.rrule import rrule, DAILY +import dateutil +from datetime import datetime as dt +import pandas +import numpy as np +import sys + + +def read_str_as_pandas(ts_str): + labels = [x.strip() for x in ts_str.split('\n')[0].split('|')] + pd = pandas.read_csv(StringIO(ts_str), sep='|', index_col=0, + date_parser=dateutil.parser.parse) + # Trim the whitespace on the column names + pd.columns = labels[1:] + pd.index.name = labels[0] + return pd + + +def get_large_ts(size=2500): + timestamps = list(rrule(DAILY, count=size, dtstart=dt(1970, 1, 1), interval=1)) + pd = pandas.DataFrame(index=timestamps, data={'n' + str(i): np.random.random_sample(size) for i in range(size)}) + pd.index.name = 'index' + return pd + + +@contextmanager +def _save_argv(): + args = sys.argv[:] + yield + sys.argv = args + + +def run_as_main(fn, *args): + """ Run a given function as if it was the + system entry point, eg for testing scripts. 
+ + Eg:: + + from scripts.Foo import main + + run_as_main(main, 'foo','bar') + + This is equivalent to ``Foo foo bar``, assuming + ``scripts.Foo.main`` is registered as an entry point. + """ + with _save_argv(): + print("run_as_main: %s" % str(args)) + sys.argv = ['progname'] + list(args) + return fn()
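The unit tests throughout this changeset drive console-script main() functions via run_as_main from tests/util.py, which swaps sys.argv for the duration of the call and restores it afterwards. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical main() entry point with a --host option; the helper functions mirror those in tests/util.py (with a try/finally added for safety), and the example is illustrative only, not part of the diff.

import argparse
import sys
from contextlib import contextmanager


@contextmanager
def _save_argv():
    # Snapshot sys.argv and restore it on exit (mirrors tests/util.py).
    saved = sys.argv[:]
    try:
        yield
    finally:
        sys.argv = saved


def run_as_main(fn, *args):
    # Call fn() as if it were the process entry point (mirrors tests/util.py).
    with _save_argv():
        sys.argv = ['progname'] + list(args)
        return fn()


def main():
    # Hypothetical entry point, standing in for e.g. arctic_init_library.main.
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', required=True)
    opts = parser.parse_args()
    return opts.host


def test_run_as_main_passes_argv():
    # The fake argv is visible to main(), and the caller's argv is restored afterwards.
    original = sys.argv[:]
    assert run_as_main(main, '--host', 'hostname') == 'hostname'
    assert sys.argv == original

This is the same mechanism the tests in tests/unit/scripts rely on when they call run_as_main(mil.main, '--host', 'hostname', ...) under mock.patch.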